Revert " [LoongArch][ISel] Check the number of sign bits in `PatGprGpr_32` (#107432)"
[llvm-project.git] / llvm / lib / Target / NVPTX / NVPTXIntrinsics.td
blobc81dfa68e4bd44b308deeda1e4e274378374f61b
1 //===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
// Pattern leaf over `fpimm`: accepts the node when its APFloat value, read
// through convertToFloat(), compares equal to 0.0f.
// Note: a C++ `==` against 0.0f is also true for -0.0f.
9 def immFloat0 : PatLeaf<(fpimm), [{
10     float f = (float)N->getValueAPF().convertToFloat();
11     return (f==0.0f);
12 }]>;
// Pattern leaf over `fpimm`: accepts the node when its APFloat value, read
// through convertToFloat(), compares equal to 1.0f.
14 def immFloat1 : PatLeaf<(fpimm), [{
15     float f = (float)N->getValueAPF().convertToFloat();
16     return (f==1.0f);
17 }]>;
// Pattern leaf over `fpimm`: accepts the node when its APFloat value, read
// through convertToDouble(), compares equal to 0.0.
// Note: a C++ `==` against 0.0 is also true for -0.0.
19 def immDouble0 : PatLeaf<(fpimm), [{
20     double d = (double)N->getValueAPF().convertToDouble();
21     return (d==0.0);
22 }]>;
// Pattern leaf over `fpimm`: accepts the node when its APFloat value, read
// through convertToDouble(), compares equal to 1.0.
24 def immDouble1 : PatLeaf<(fpimm), [{
25     double d = (double)N->getValueAPF().convertToDouble();
26     return (d==1.0);
27 }]>;
// Record holding three named C++ code fragments. Each fragment returns the
// result of ChkMemSDNodeAddressSpace(N, <space>) for the generic, shared and
// global address-space constants respectively.
// NOTE(review): presumably spliced into predicate bodies of memory-node
// pattern fragments; the users are not visible in this part of the file.
29 def AS_match {
30   code generic = [{
31    return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
32   }];
33   code shared = [{
34    return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
35   }];
36   code global = [{
37    return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
38   }];
41 // A node that will be replaced with the current PTX version.
// PTXVerXform is an SDNodeXForm over `imm` that ignores the incoming value and
// returns getI32Imm(Subtarget->getPTXVersion(), SDLoc(N)). The `version` dag
// applies that transform to a dummy (i32 0) operand. `ptx` is the single
// instance of the class, so the dag is reachable as `ptx.version`.
42 class PTX {
43   SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
44     return getI32Imm(Subtarget->getPTXVersion(), SDLoc(N));
45   }]>;
46   // (i32 0) will be XForm'ed to the currently used PTX version.
47   dag version = (PTXVerXform (i32 0));
49 def ptx : PTX;
51 // Generates list of n sequential register names.
52 // E.g. RegSeq<3,"r">.ret -> ["r0", "r1", "r2" ]
// Implemented recursively: for non-zero n the result is RegSeq<n-1>.ret with
// the name `prefix # (n-1)` appended; for n == 0 the result is the empty list.
53 class RegSeq<int n, string prefix> {
54   list<string> ret = !if(n, !listconcat(RegSeq<!sub(n, 1), prefix>.ret,
55                                         [prefix # !sub(n, 1)]),
56                             []);
// Helper that yields the list of values to iterate over for a "threadmask is
// an immediate" flag: [0, 1] when `sync` is set, otherwise just [0].
// It is consumed by the shfl `foreach` nest further down in this file.
59 class THREADMASK_INFO<bit sync> {
60   list<bit> ret = !if(sync, [0, 1], [0]);
63 //-----------------------------------
64 // Synchronization and shuffle functions
65 //-----------------------------------
// Every definition from here down to the matching "} // isConvergent = true"
// line is created with isConvergent = true.
66 let isConvergent = true in {
// Three `bar.sync` forms with no results:
//   INT_BARRIER0 - no operands, prints barrier id 0; selects int_nvvm_barrier0.
//   INT_BARRIERN - one register operand; selects int_nvvm_barrier_n.
//   INT_BARRIER  - two register operands; selects int_nvvm_barrier.
// NOTE(review): $src1 is presumably the barrier id and $src2 the participating
// thread count -- confirm against the PTX ISA `bar` description.
67 def INT_BARRIER0 : NVPTXInst<(outs), (ins),
68                   "bar.sync \t0;",
69       [(int_nvvm_barrier0)]>;
70 def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
71                   "bar.sync \t$src1;",
72       [(int_nvvm_barrier_n Int32Regs:$src1)]>;
73 def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
74                   "bar.sync \t$src1, $src2;",
75       [(int_nvvm_barrier Int32Regs:$src1, Int32Regs:$src2)]>;
// Selects int_nvvm_barrier0_popc. The emitted text is a brace-scoped PTX block
// that declares a local predicate %p1, sets it to ($pred != 0) with
// setp.ne.u32, and then issues bar.red.popc.u32 on barrier 0 with %p1,
// writing the result to $dst.
76 def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
77   !strconcat("{{ \n\t",
78              ".reg .pred \t%p1; \n\t",
79              "setp.ne.u32 \t%p1, $pred, 0; \n\t",
80              "bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
81              "}}"),
82       [(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>;
// Selects int_nvvm_barrier0_and. The emitted brace-scoped PTX block declares
// two local predicates: %p1 = ($pred != 0) is the input to bar.red.and.pred on
// barrier 0, whose predicate result %p2 is turned back into a 32-bit 1/0 value
// in $dst with selp.u32.
83 def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
84   !strconcat("{{ \n\t",
85              ".reg .pred \t%p1; \n\t",
86              ".reg .pred \t%p2; \n\t",
87              "setp.ne.u32 \t%p1, $pred, 0; \n\t",
88              "bar.red.and.pred \t%p2, 0, %p1; \n\t",
89              "selp.u32 \t$dst, 1, 0, %p2; \n\t",
90              "}}"),
91       [(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>;
// Selects int_nvvm_barrier0_or. Same shape as the `and` variant: %p1 =
// ($pred != 0) feeds bar.red.or.pred on barrier 0, and the predicate result
// %p2 is converted to a 32-bit 1/0 value in $dst with selp.u32.
92 def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
93   !strconcat("{{ \n\t",
94              ".reg .pred \t%p1; \n\t",
95              ".reg .pred \t%p2; \n\t",
96              "setp.ne.u32 \t%p1, $pred, 0; \n\t",
97              "bar.red.or.pred \t%p2, 0, %p1; \n\t",
98              "selp.u32 \t$dst, 1, 0, %p2; \n\t",
99              "}}"),
100       [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
// INT_BAR_SYNC: `bar.sync` with an immediate operand; selects
// int_nvvm_bar_sync. It carries no Requires<> predicate list.
// INT_BAR_WARP_SYNC_{I,R}: `bar.warp.sync` with the operand given as an
// immediate (_I) or in a 32-bit register (_R); both select
// int_nvvm_bar_warp_sync and are guarded by hasPTX<60> and hasSM<30>.
102 def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;",
103                              [(int_nvvm_bar_sync imm:$i)]>;
105 def INT_BAR_WARP_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync \t$i;",
106                              [(int_nvvm_bar_warp_sync imm:$i)]>,
107         Requires<[hasPTX<60>, hasSM<30>]>;
108 def INT_BAR_WARP_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync \t$i;",
109                              [(int_nvvm_bar_warp_sync Int32Regs:$i)]>,
110         Requires<[hasPTX<60>, hasSM<30>]>;
// `barrier.sync` with a single operand, selected from int_nvvm_barrier_sync.
// _I takes the operand as an immediate, _R in a 32-bit register. Both are
// guarded by hasPTX<60> and hasSM<30>.
112 def INT_BARRIER_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "barrier.sync \t$i;",
113                                    [(int_nvvm_barrier_sync imm:$i)]>,
114         Requires<[hasPTX<60>, hasSM<30>]>;
115 def INT_BARRIER_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "barrier.sync \t$i;",
116                                    [(int_nvvm_barrier_sync Int32Regs:$i)]>,
117         Requires<[hasPTX<60>, hasSM<30>]>;
// Two-operand `barrier.sync $id, $cnt`, selected from
// int_nvvm_barrier_sync_cnt. One definition exists per register/immediate
// combination; in the suffix the first letter describes $id and the second
// $cnt (R = Int32Regs, I = i32imm). All four are guarded by hasPTX<60> and
// hasSM<30>.
119 def INT_BARRIER_SYNC_CNT_RR : NVPTXInst<(outs), (ins Int32Regs:$id, Int32Regs:$cnt),
120                  "barrier.sync \t$id, $cnt;",
121                  [(int_nvvm_barrier_sync_cnt Int32Regs:$id, Int32Regs:$cnt)]>,
122         Requires<[hasPTX<60>, hasSM<30>]>;
123 def INT_BARRIER_SYNC_CNT_RI : NVPTXInst<(outs), (ins Int32Regs:$id, i32imm:$cnt),
124                  "barrier.sync \t$id, $cnt;",
125                  [(int_nvvm_barrier_sync_cnt Int32Regs:$id, imm:$cnt)]>,
126         Requires<[hasPTX<60>, hasSM<30>]>;
127 def INT_BARRIER_SYNC_CNT_IR : NVPTXInst<(outs), (ins i32imm:$id, Int32Regs:$cnt),
128                  "barrier.sync \t$id, $cnt;",
129                  [(int_nvvm_barrier_sync_cnt imm:$id, Int32Regs:$cnt)]>,
130         Requires<[hasPTX<60>, hasSM<30>]>;
131 def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
132                  "barrier.sync \t$id, $cnt;",
133                  [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
134         Requires<[hasPTX<60>, hasSM<30>]>;
// Operand-less instruction that prints "barrier.cluster.<variant>;" and is
// selected from the bare intrinsic `Intr`.
//   variant - text appended after "barrier.cluster." in the mnemonic.
//   Intr    - intrinsic matched by the selection pattern.
//   Preds   - predicate list passed to Requires<>; defaults to
//             [hasPTX<78>, hasSM<90>].
136 class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
137                           list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
138         NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>,
139         Requires<Preds>;
// Instantiations of INT_BARRIER_CLUSTER for arrive / arrive.relaxed / wait and
// their ".aligned" counterparts. The two "relaxed" forms override the default
// predicates with [hasPTX<80>, hasSM<90>]; the others use the class default.
141 def barrier_cluster_arrive:
142         INT_BARRIER_CLUSTER<"arrive", int_nvvm_barrier_cluster_arrive>;
143 def barrier_cluster_arrive_relaxed:
144         INT_BARRIER_CLUSTER<"arrive.relaxed",
145         int_nvvm_barrier_cluster_arrive_relaxed, [hasPTX<80>, hasSM<90>]>;
146 def barrier_cluster_wait:
147         INT_BARRIER_CLUSTER<"wait", int_nvvm_barrier_cluster_wait>;
149 // 'aligned' versions of the cluster barrier intrinsics
150 def barrier_cluster_arrive_aligned:
151         INT_BARRIER_CLUSTER<"arrive.aligned", int_nvvm_barrier_cluster_arrive_aligned>;
152 def barrier_cluster_arrive_relaxed_aligned:
153         INT_BARRIER_CLUSTER<"arrive.relaxed.aligned",
154         int_nvvm_barrier_cluster_arrive_relaxed_aligned, [hasPTX<80>, hasSM<90>]>;
155 def barrier_cluster_wait_aligned:
156         INT_BARRIER_CLUSTER<"wait.aligned", int_nvvm_barrier_cluster_wait_aligned>;
// Generic builder for one `shfl` / `shfl.sync` instruction variant. The base
// NVPTXInst is created with empty operand lists, a "?" asm string and no
// pattern; all four are then overridden with `let` from the parameters below.
//   sync           - use the "int_nvvm_shfl_sync_*" intrinsic, add a leading
//                    $threadmask input, and print "shfl.sync." plus a trailing
//                    ", $threadmask".
//   mode           - spliced into both the intrinsic name and the mnemonic.
//   reg            - "i32" or "f32"; picks Int32Regs or Float32Regs for $src
//                    and $dst and is appended to the intrinsic name. The
//                    !cond has no other arms.
//   return_pred    - add an Int1Regs:$pred result, append "p" to the intrinsic
//                    name and print the destination as "$dst|$pred".
//   offset_imm, mask_imm, threadmask_imm
//                  - whether the corresponding input is an i32imm (set) or an
//                    Int32Regs register (clear).
// The intrinsic record is looked up by name with !cast<Intrinsic>.
// The selection pattern is derived from the operand lists: `outs` is replaced
// by `set` in the result list, `ins` is replaced by the intrinsic in the input
// list, every i32imm becomes `imm`, and the two dags are joined with !con.
158 class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
159                  bit offset_imm, bit mask_imm, bit threadmask_imm>
160       : NVPTXInst<(outs), (ins), "?", []> {
161   NVPTXRegClass rc = !cond(
162     !eq(reg, "i32"): Int32Regs,
163     !eq(reg, "f32"): Float32Regs);
164   string IntrName = "int_nvvm_shfl_"
165                     # !if(sync, "sync_", "")
166                     # mode
167                     # "_" # reg
168                     # !if(return_pred, "p", "");
169   Intrinsic Intr = !cast<Intrinsic>(IntrName);
170   let InOperandList = !con(
171     !if(sync,
172         !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]),
173         (ins)),
174     (ins rc:$src),
175     !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]),
176     !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"])
177     );
178   let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst));
179   let AsmString = "shfl."
180      # !if(sync, "sync.", "")
181      # mode # ".b32\t"
182      # "$dst"
183      # !if(return_pred, "|$pred", "") # ", "
184      # "$src, $offset, $mask"
185      # !if(sync, ", $threadmask", "")
186      # ";"
187      ;
188   let Pattern = [!con(
189       !foreach(tmp, OutOperandList,
190              !subst(outs, set,
191              !subst(i32imm, imm, tmp))),
192       (set !foreach(tmp, InOperandList,
193              !subst(ins, Intr,
194              !subst(i32imm, imm, tmp))))
195   )];
// Instantiate an anonymous SHFL_INSTR for every combination of sync, mode,
// register type, predicate result and register/immediate choice per operand.
// The threadmask loop runs over THREADMASK_INFO<sync>.ret, so non-sync
// variants are generated once (threadmask_imm = 0 only).
// Sync variants require hasSM<30> and hasPTX<60>; non-sync variants require
// hasSM<30> and hasSHFL.
198 foreach sync = [false, true] in {
199   foreach mode = ["up", "down", "bfly", "idx"] in {
200     foreach regclass = ["i32", "f32"] in {
201       foreach return_pred = [false, true] in {
202         foreach offset_imm = [false, true] in {
203           foreach mask_imm = [false, true] in {
204             foreach threadmask_imm = THREADMASK_INFO<sync>.ret in {
205               def : SHFL_INSTR<sync, mode, regclass, return_pred,
206                                offset_imm, mask_imm, threadmask_imm>,
207                     Requires<!if(sync, [hasSM<30>, hasPTX<60>], [hasSM<30>, hasSHFL])>;
208             }
209           }
210         }
211       }
212     }
213   }
216 // vote.{all,any,uni,ballot}
// One anonymous instruction per instantiation: takes an Int1Regs predicate,
// prints "vote.<mode> $dest, $pred;" and selects IntOp.
//   regclass - register class of the result ($dest).
//   mode     - text following "vote." in the mnemonic.
//   IntOp    - intrinsic matched by the pattern.
// Guarded by hasPTX<60> and hasSM<30>.
217 multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
218   def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred),
219               "vote." # mode # " \t$dest, $pred;",
220               [(set regclass:$dest, (IntOp Int1Regs:$pred))]>,
221         Requires<[hasPTX<60>, hasSM<30>]>;
224 defm VOTE_ALL : VOTE<Int1Regs, "all.pred", int_nvvm_vote_all>;
225 defm VOTE_ANY : VOTE<Int1Regs, "any.pred", int_nvvm_vote_any>;
226 defm VOTE_UNI : VOTE<Int1Regs, "uni.pred", int_nvvm_vote_uni>;
227 defm VOTE_BALLOT : VOTE<Int32Regs, "ballot.b32", int_nvvm_vote_ballot>;
229 // vote.sync.{all,any,uni,ballot}
// Two instructions per instantiation, differing only in how $mask is supplied:
// `i` takes it as an i32imm, `r` in an Int32Regs register. The intrinsic takes
// (mask, pred) while the assembly prints "$dest, $pred, $mask".
// Both are guarded by hasPTX<60> and hasSM<30>.
230 multiclass VOTE_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
231   def i : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, Int1Regs:$pred),
232               "vote.sync." # mode # " \t$dest, $pred, $mask;",
233               [(set regclass:$dest, (IntOp imm:$mask, Int1Regs:$pred))]>,
234           Requires<[hasPTX<60>, hasSM<30>]>;
235   def r : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, Int1Regs:$pred),
236               "vote.sync." # mode #" \t$dest, $pred, $mask;",
237               [(set regclass:$dest, (IntOp Int32Regs:$mask, Int1Regs:$pred))]>,
238           Requires<[hasPTX<60>, hasSM<30>]>;
241 defm VOTE_SYNC_ALL : VOTE_SYNC<Int1Regs, "all.pred", int_nvvm_vote_all_sync>;
242 defm VOTE_SYNC_ANY : VOTE_SYNC<Int1Regs, "any.pred", int_nvvm_vote_any_sync>;
243 defm VOTE_SYNC_UNI : VOTE_SYNC<Int1Regs, "uni.pred", int_nvvm_vote_uni_sync>;
244 defm VOTE_SYNC_BALLOT : VOTE_SYNC<Int32Regs, "ballot.b32", int_nvvm_vote_ballot_sync>;
// match.any.sync.<ptxtype>: four variants covering every register/immediate
// combination of the two inputs. In the suffix the first letter describes
// $value (i = ImmOp immediate, r = regclass register) and the second describes
// $mask (i = i32imm, r = Int32Regs). The result is always an Int32Regs $dest.
// The intrinsic takes (mask, value) while the assembly prints
// "$dest, $value, $mask". All variants require hasPTX<60> and hasSM<70>.
246 multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp,
247                           Operand ImmOp> {
248   def ii : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, ImmOp:$value),
249               "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
250               [(set Int32Regs:$dest, (IntOp imm:$mask, imm:$value))]>,
251            Requires<[hasPTX<60>, hasSM<70>]>;
252   def ir : NVPTXInst<(outs Int32Regs:$dest), (ins Int32Regs:$mask, ImmOp:$value),
253               "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
254               [(set Int32Regs:$dest, (IntOp Int32Regs:$mask, imm:$value))]>,
255            Requires<[hasPTX<60>, hasSM<70>]>;
256   def ri : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, regclass:$value),
257               "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
258               [(set Int32Regs:$dest, (IntOp imm:$mask, regclass:$value))]>,
259            Requires<[hasPTX<60>, hasSM<70>]>;
260   def rr : NVPTXInst<(outs Int32Regs:$dest), (ins Int32Regs:$mask, regclass:$value),
261               "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
262               [(set Int32Regs:$dest, (IntOp Int32Regs:$mask, regclass:$value))]>,
263            Requires<[hasPTX<60>, hasSM<70>]>;
266 // activemask.b32
// No inputs; writes a 32-bit result to $dest and selects int_nvvm_activemask.
// Guarded by hasPTX<62> and hasSM<30>.
267 def ACTIVEMASK : NVPTXInst<(outs Int32Regs:$dest), (ins),
268                     "activemask.b32 \t$dest;",
269                     [(set Int32Regs:$dest, (int_nvvm_activemask))]>,
270                  Requires<[hasPTX<62>, hasSM<30>]>;
// 32-bit and 64-bit instantiations of MATCH_ANY_SYNC; the value operand uses
// Int32Regs/i32imm for "b32" and Int64Regs/i64imm for "b64".
272 defm MATCH_ANY_SYNC_32 : MATCH_ANY_SYNC<Int32Regs, "b32", int_nvvm_match_any_sync_i32,
273                                         i32imm>;
274 defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<Int64Regs, "b64", int_nvvm_match_any_sync_i64,
275                                         i64imm>;
// match.all.sync.<ptxtype> with a predicate result: same four
// register/immediate variants and suffix convention as MATCH_ANY_SYNC (first
// letter = $value kind, second letter = $mask kind), but each instruction
// produces two results, Int32Regs:$dest and Int1Regs:$pred, printed as
// "$dest|$pred". All variants require hasPTX<60> and hasSM<70>.
277 multiclass MATCH_ALLP_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp,
278                           Operand ImmOp> {
279   def ii : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
280                      (ins i32imm:$mask, ImmOp:$value),
281               "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
282               [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp imm:$mask, imm:$value))]>,
283            Requires<[hasPTX<60>, hasSM<70>]>;
284   def ir : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
285                      (ins Int32Regs:$mask, ImmOp:$value),
286               "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
287               [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, imm:$value))]>,
288            Requires<[hasPTX<60>, hasSM<70>]>;
289   def ri : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
290                      (ins i32imm:$mask, regclass:$value),
291               "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
292               [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp imm:$mask, regclass:$value))]>,
293            Requires<[hasPTX<60>, hasSM<70>]>;
294   def rr : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
295                      (ins Int32Regs:$mask, regclass:$value),
296               "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
297               [(set Int32Regs:$dest, Int1Regs:$pred, (IntOp Int32Regs:$mask, regclass:$value))]>,
298            Requires<[hasPTX<60>, hasSM<70>]>;
300 defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<Int32Regs, "b32", int_nvvm_match_all_sync_i32p,
301                                          i32imm>;
302 defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_sync_i64p,
303                                          i64imm>;
// redux.sync.<BinOp>.<PTXType>: one anonymous instruction per instantiation
// with two Int32Regs inputs ($src, $mask) and an Int32Regs result, selected
// from `Intrin`. Guarded by hasPTX<70> and hasSM<80>.
// The instantiations below use "u32" for umin/umax, "s32" for add/min/max and
// "b32" for and/xor/or.
305 multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> {
306   def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask),
307           "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;",
308           [(set Int32Regs:$dst, (Intrin Int32Regs:$src, Int32Regs:$mask))]>,
309         Requires<[hasPTX<70>, hasSM<80>]>;
312 defm REDUX_SYNC_UMIN : REDUX_SYNC<"min", "u32", int_nvvm_redux_sync_umin>;
313 defm REDUX_SYNC_UMAX : REDUX_SYNC<"max", "u32", int_nvvm_redux_sync_umax>;
314 defm REDUX_SYNC_ADD : REDUX_SYNC<"add", "s32", int_nvvm_redux_sync_add>;
315 defm REDUX_SYNC_MIN : REDUX_SYNC<"min", "s32", int_nvvm_redux_sync_min>;
316 defm REDUX_SYNC_MAX : REDUX_SYNC<"max", "s32", int_nvvm_redux_sync_max>;
317 defm REDUX_SYNC_AND : REDUX_SYNC<"and", "b32", int_nvvm_redux_sync_and>;
318 defm REDUX_SYNC_XOR : REDUX_SYNC<"xor", "b32", int_nvvm_redux_sync_xor>;
319 defm REDUX_SYNC_OR : REDUX_SYNC<"or", "b32", int_nvvm_redux_sync_or>;
321 } // isConvergent = true
323 //-----------------------------------
324 // Explicit Memory Fence Functions
325 //-----------------------------------
// Operand-less instruction whose complete assembly text is `StrOp` and whose
// pattern is the bare intrinsic `IntOP`.
// The three membar.* definitions carry no Requires<> list;
// INT_FENCE_SC_CLUSTER reuses the class and adds hasPTX<78> and hasSM<90>.
326 class MEMBAR<string StrOp, Intrinsic IntOP> :
327               NVPTXInst<(outs), (ins),
328             StrOp, [(IntOP)]>;
330 def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
331 def INT_MEMBAR_GL  : MEMBAR<"membar.gl;",  int_nvvm_membar_gl>;
332 def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
334 def INT_FENCE_SC_CLUSTER:
335        MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
336        Requires<[hasPTX<78>, hasSM<90>]>;
338 //-----------------------------------
339 // Async Copy Functions
340 //-----------------------------------
// cp.async.mbarrier.arrive[<NoInc>][<AddrSpace>].b64 [$addr]
// Two variants per instantiation: _32 takes the address in an Int32Regs
// register, _64 in an Int64Regs register. Both print the same ".b64" mnemonic
// and select `Intrin`. Guarded by hasPTX<70> and hasSM<80>.
//   NoInc     - "" or ".noinc", inserted after "arrive".
//   AddrSpace - "" or ".shared", inserted before ".b64".
342 multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
343   def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
344             !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
345             [(Intrin Int32Regs:$addr)]>,
346     Requires<[hasPTX<70>, hasSM<80>]>;
347   def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
348             !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
349             [(Intrin Int64Regs:$addr)]>,
350     Requires<[hasPTX<70>, hasSM<80>]>;
353 defm CP_ASYNC_MBARRIER_ARRIVE :
354   CP_ASYNC_MBARRIER_ARRIVE<"", "", int_nvvm_cp_async_mbarrier_arrive>;
355 defm CP_ASYNC_MBARRIER_ARRIVE_SHARED :
356   CP_ASYNC_MBARRIER_ARRIVE<"", ".shared", int_nvvm_cp_async_mbarrier_arrive_shared>;
357 defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
358   CP_ASYNC_MBARRIER_ARRIVE<".noinc", "", int_nvvm_cp_async_mbarrier_arrive_noinc>;
359 defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
360   CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
// cp.async.<cc>.shared.global [$dst], [$src], <cpsize>[, $src_size]
// Six variants per instantiation:
//   _32, _64     - $dst/$src in Int32Regs or Int64Regs; select `Intrin`.
//   _32s, _64s   - as above plus $src_size in an Int32Regs register; select
//                  `IntrinS`.
//   _32si, _64si - as above with $src_size as an i32imm; select `IntrinS`.
// `cpsize` is printed literally as the copy-size operand. All variants are
// guarded by hasPTX<70> and hasSM<80>.
362 multiclass CP_ASYNC_SHARED_GLOBAL_I<string cc, string cpsize, Intrinsic Intrin, Intrinsic IntrinS> {
363   def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
364             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
365             [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
366     Requires<[hasPTX<70>, hasSM<80>]>;
367   def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
368             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
369             [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
370     Requires<[hasPTX<70>, hasSM<80>]>;
371   // Variant with src_size parameter
372   def _32s : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size),
373              !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
374              [(IntrinS Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size)]>,
375     Requires<[hasPTX<70>, hasSM<80>]>;
376   def _32si: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size),
377              !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
378              [(IntrinS Int32Regs:$dst, Int32Regs:$src, imm:$src_size)]>,
379     Requires<[hasPTX<70>, hasSM<80>]>;
380   def _64s : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size),
381              !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
382              [(IntrinS Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size)]>,
383     Requires<[hasPTX<70>, hasSM<80>]>;
384   def _64si: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size),
385              !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
386              [(IntrinS Int64Regs:$dst, Int64Regs:$src, imm:$src_size)]>,
387     Requires<[hasPTX<70>, hasSM<80>]>;
390 defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
391   CP_ASYNC_SHARED_GLOBAL_I<"ca", "4", int_nvvm_cp_async_ca_shared_global_4,
392                                       int_nvvm_cp_async_ca_shared_global_4_s>;
394 defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
395   CP_ASYNC_SHARED_GLOBAL_I<"ca", "8", int_nvvm_cp_async_ca_shared_global_8,
396                                       int_nvvm_cp_async_ca_shared_global_8_s>;
398 defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
399   CP_ASYNC_SHARED_GLOBAL_I<"ca", "16", int_nvvm_cp_async_ca_shared_global_16,
400                                        int_nvvm_cp_async_ca_shared_global_16_s>;
402 defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
403   CP_ASYNC_SHARED_GLOBAL_I<"cg", "16", int_nvvm_cp_async_cg_shared_global_16,
404                                        int_nvvm_cp_async_cg_shared_global_16_s>;
// Group-management instructions for cp.async, all guarded by hasPTX<70> and
// hasSM<80>:
//   CP_ASYNC_COMMIT_GROUP - no operands; selects int_nvvm_cp_async_commit_group.
//   CP_ASYNC_WAIT_GROUP   - one immediate $n, matched as an i32 `timm` in the
//                           pattern; selects int_nvvm_cp_async_wait_group.
//   CP_ASYNC_WAIT_ALL     - no operands; selects int_nvvm_cp_async_wait_all.
406 def CP_ASYNC_COMMIT_GROUP :
407   NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
408   Requires<[hasPTX<70>, hasSM<80>]>;
410 def CP_ASYNC_WAIT_GROUP :
411   NVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group $n;",
412   [(int_nvvm_cp_async_wait_group (i32 timm:$n))]>,
413   Requires<[hasPTX<70>, hasSM<80>]>;
415 def CP_ASYNC_WAIT_ALL :
416   NVPTXInst<(outs), (ins), "cp.async.wait_all;",
417   [(int_nvvm_cp_async_wait_all)]>,
418   Requires<[hasPTX<70>, hasSM<80>]>;
420 // cp.async.bulk variants of the commit/wait group
// All three are guarded by hasPTX<80> and hasSM<90>. The two wait forms take
// a single immediate $n, matched as an i32 `timm` in the pattern; the ".read"
// form selects int_nvvm_cp_async_bulk_wait_group_read.
421 def CP_ASYNC_BULK_COMMIT_GROUP :
422   NVPTXInst<(outs), (ins), "cp.async.bulk.commit_group;",
423   [(int_nvvm_cp_async_bulk_commit_group)]>,
424   Requires<[hasPTX<80>, hasSM<90>]>;
426 def CP_ASYNC_BULK_WAIT_GROUP :
427   NVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group $n;",
428   [(int_nvvm_cp_async_bulk_wait_group (i32 timm:$n))]>,
429   Requires<[hasPTX<80>, hasSM<90>]>;
431 def CP_ASYNC_BULK_WAIT_GROUP_READ :
432   NVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group.read $n;",
433   [(int_nvvm_cp_async_bulk_wait_group_read (i32 timm:$n))]>,
434   Requires<[hasPTX<80>, hasSM<90>]>;
436 //-----------------------------------
437 // MBarrier Functions
438 //-----------------------------------
// mbarrier.init[<AddrSpace>].b64 [$addr], $count
// No results. _32 and _64 differ only in the register class of $addr
// (Int32Regs vs Int64Regs); $count is always Int32Regs. Both select `Intrin`
// and require hasPTX<70> and hasSM<80>.
440 multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
441   def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr, Int32Regs:$count),
442            !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
443     [(Intrin Int32Regs:$addr, Int32Regs:$count)]>,
444     Requires<[hasPTX<70>, hasSM<80>]>;
445   def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr, Int32Regs:$count),
446            !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
447     [(Intrin Int64Regs:$addr, Int32Regs:$count)]>,
448     Requires<[hasPTX<70>, hasSM<80>]>;
451 defm MBARRIER_INIT : MBARRIER_INIT<"", int_nvvm_mbarrier_init>;
452 defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
453                                           int_nvvm_mbarrier_init_shared>;
// mbarrier.inval[<AddrSpace>].b64 [$addr]
// No results; the only operand is the address, in Int32Regs (_32) or
// Int64Regs (_64). Both select `Intrin` and require hasPTX<70> and hasSM<80>.
455 multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
456   def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
457            !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
458     [(Intrin Int32Regs:$addr)]>,
459     Requires<[hasPTX<70>, hasSM<80>]>;
460   def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
461            !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
462     [(Intrin Int64Regs:$addr)]>,
463     Requires<[hasPTX<70>, hasSM<80>]>;
466 defm MBARRIER_INVAL : MBARRIER_INVAL<"", int_nvvm_mbarrier_inval>;
467 defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
468                                             int_nvvm_mbarrier_inval_shared>;
// mbarrier.arrive[<AddrSpace>].b64 $state, [$addr]
// Produces a 64-bit $state result (Int64Regs) from the address operand, which
// is in Int32Regs (_32) or Int64Regs (_64). Both select `Intrin` and require
// hasPTX<70> and hasSM<80>.
470 multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
471   def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
472            !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
473     [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
474     Requires<[hasPTX<70>, hasSM<80>]>;
475   def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
476            !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
477     [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
478     Requires<[hasPTX<70>, hasSM<80>]>;
481 defm MBARRIER_ARRIVE : MBARRIER_ARRIVE<"", int_nvvm_mbarrier_arrive>;
482 defm MBARRIER_ARRIVE_SHARED :
483   MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>;
// mbarrier.arrive.noComplete[<AddrSpace>].b64 $state, [$addr], $count
// Same as MBARRIER_ARRIVE with an additional Int32Regs $count input. The
// address is in Int32Regs (_32) or Int64Regs (_64); the result is a 64-bit
// $state. Both variants require hasPTX<70> and hasSM<80>.
485 multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
486   def _32 : NVPTXInst<(outs Int64Regs:$state),
487            (ins Int32Regs:$addr, Int32Regs:$count),
488            !strconcat("mbarrier.arrive.noComplete", AddrSpace,
489                       ".b64 $state, [$addr], $count;"),
490     [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
491     Requires<[hasPTX<70>, hasSM<80>]>;
492   def _64 : NVPTXInst<(outs Int64Regs:$state),
493            (ins Int64Regs:$addr, Int32Regs:$count),
494            !strconcat("mbarrier.arrive.noComplete", AddrSpace,
495                       ".b64 $state, [$addr], $count;"),
496     [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
497     Requires<[hasPTX<70>, hasSM<80>]>;
500 defm MBARRIER_ARRIVE_NOCOMPLETE :
501   MBARRIER_ARRIVE_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_noComplete>;
502 defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
503   MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>;
// mbarrier.arrive_drop[<AddrSpace>].b64 $state, [$addr]
// Produces a 64-bit $state result from the address operand, which is in
// Int32Regs (_32) or Int64Regs (_64). Both select `Intrin` and require
// hasPTX<70> and hasSM<80>.
505 multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
506   def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
507            !strconcat("mbarrier.arrive_drop", AddrSpace,
508                       ".b64 $state, [$addr];"),
509            [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
510     Requires<[hasPTX<70>, hasSM<80>]>;
511   def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
512            !strconcat("mbarrier.arrive_drop", AddrSpace,
513                       ".b64 $state, [$addr];"),
514            [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
515     Requires<[hasPTX<70>, hasSM<80>]>;
518 defm MBARRIER_ARRIVE_DROP :
519   MBARRIER_ARRIVE_DROP<"", int_nvvm_mbarrier_arrive_drop>;
520 defm MBARRIER_ARRIVE_DROP_SHARED :
521   MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>;
// mbarrier.arrive_drop.noComplete[<AddrSpace>].b64 $state, [$addr], $count
// Same as MBARRIER_ARRIVE_DROP with an additional Int32Regs $count input.
// The address is in Int32Regs (_32) or Int64Regs (_64); the result is a
// 64-bit $state. Both variants require hasPTX<70> and hasSM<80>.
523 multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
524   def _32 : NVPTXInst<(outs Int64Regs:$state),
525            (ins Int32Regs:$addr, Int32Regs:$count),
526            !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
527                       ".b64 $state, [$addr], $count;"),
528            [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
529     Requires<[hasPTX<70>, hasSM<80>]>;
530   def _64 : NVPTXInst<(outs Int64Regs:$state),
531            (ins Int64Regs:$addr, Int32Regs:$count),
532            !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
533                       ".b64 $state, [$addr], $count;"),
534            [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
535     Requires<[hasPTX<70>, hasSM<80>]>;
538 defm MBARRIER_ARRIVE_DROP_NOCOMPLETE :
539   MBARRIER_ARRIVE_DROP_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_drop_noComplete>;
540 defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
541   MBARRIER_ARRIVE_DROP_NOCOMPLETE<".shared",
542                        int_nvvm_mbarrier_arrive_drop_noComplete_shared>;
// mbarrier.test_wait[<AddrSpace>].b64 $res, [$addr], $state
// Takes an address (Int32Regs for _32, Int64Regs for _64) and a 64-bit $state
// value, and produces a predicate result in Int1Regs:$res. Both variants
// select `Intrin` and require hasPTX<70> and hasSM<80>.
544 multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
545   def _32 : NVPTXInst<(outs Int1Regs:$res), (ins Int32Regs:$addr, Int64Regs:$state),
546            !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
547            [(set Int1Regs:$res, (Intrin Int32Regs:$addr, Int64Regs:$state))]>,
548     Requires<[hasPTX<70>, hasSM<80>]>;
549   def _64 : NVPTXInst<(outs Int1Regs:$res), (ins Int64Regs:$addr, Int64Regs:$state),
550            !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
551            [(set Int1Regs:$res, (Intrin Int64Regs:$addr, Int64Regs:$state))]>,
552     Requires<[hasPTX<70>, hasSM<80>]>;
555 defm MBARRIER_TEST_WAIT :
556   MBARRIER_TEST_WAIT<"", int_nvvm_mbarrier_test_wait>;
557 defm MBARRIER_TEST_WAIT_SHARED :
558   MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared>;
// mbarrier.pending_count.b64 $res, $state
// Takes a 64-bit $state value in a register (no memory operand) and produces
// a 32-bit result. Selects `Intrin`; requires hasPTX<70> and hasSM<80>.
// The class and its single instance below share the same name.
560 class MBARRIER_PENDING_COUNT<Intrinsic Intrin> :
561            NVPTXInst<(outs Int32Regs:$res), (ins Int64Regs:$state),
562            "mbarrier.pending_count.b64 $res, $state;",
563            [(set Int32Regs:$res, (Intrin Int64Regs:$state))]>,
564     Requires<[hasPTX<70>, hasSM<80>]>;
566 def MBARRIER_PENDING_COUNT :
567   MBARRIER_PENDING_COUNT<int_nvvm_mbarrier_pending_count>;
569 //-----------------------------------
570 // Math Functions
571 //-----------------------------------
573 // Map min(1.0, max(0.0, x)) to sat(x)
574 // Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x) because when x is
575 // NaN
576 // max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.
577 // Same story for fmax, fmin.
// The patterns below rewrite fmin(1, fmax(0, a)) into a same-type CVT with the
// CvtSAT modifier. Four patterns are needed per type because the constant may
// appear as either operand of both the outer fmin and the inner fmax.
// f32 forms (int_nvvm_fmin_f / int_nvvm_fmax_f -> CVT_f32_f32):
579 def : Pat<(int_nvvm_fmin_f immFloat1,
580             (int_nvvm_fmax_f immFloat0, Float32Regs:$a)),
581           (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
582 def : Pat<(int_nvvm_fmin_f immFloat1,
583             (int_nvvm_fmax_f Float32Regs:$a, immFloat0)),
584           (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
585 def : Pat<(int_nvvm_fmin_f
586             (int_nvvm_fmax_f immFloat0, Float32Regs:$a), immFloat1),
587           (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
588 def : Pat<(int_nvvm_fmin_f
589             (int_nvvm_fmax_f Float32Regs:$a, immFloat0), immFloat1),
590           (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
// f64 forms (int_nvvm_fmin_d / int_nvvm_fmax_d -> CVT_f64_f64):
592 def : Pat<(int_nvvm_fmin_d immDouble1,
593             (int_nvvm_fmax_d immDouble0, Float64Regs:$a)),
594           (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
595 def : Pat<(int_nvvm_fmin_d immDouble1,
596             (int_nvvm_fmax_d Float64Regs:$a, immDouble0)),
597           (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
598 def : Pat<(int_nvvm_fmin_d
599             (int_nvvm_fmax_d immDouble0, Float64Regs:$a), immDouble1),
600           (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
601 def : Pat<(int_nvvm_fmin_d
602             (int_nvvm_fmax_d Float64Regs:$a, immDouble0), immDouble1),
603           (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
// F_MATH_1 / F_MATH_2 / F_MATH_3: one-, two- and three-input instruction
// templates. Each takes the complete assembly string (OpcStr), the result
// register class, one register class per source operand ($src0, $src1, $src2),
// the intrinsic to select, and an optional predicate list that defaults to
// empty and is forwarded to Requires<>.
606 // We need a full string for OpcStr here because we need to deal with case like
607 // INT_PTX_RECIP.
608 class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass,
609   NVPTXRegClass src_regclass, Intrinsic IntOP, list<Predicate> Preds = []>
610             : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0),
611             OpcStr,
612         [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>,
613         Requires<Preds>;
615 // We need a full string for OpcStr here because we need to deal with the case
616 // like INT_PTX_NATIVE_POWR_F.
617 class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
618   NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP,
619   list<Predicate> Preds = []>
620             : NVPTXInst<(outs t_regclass:$dst),
621               (ins s0_regclass:$src0, s1_regclass:$src1),
622             OpcStr,
623         [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>,
624         Requires<Preds>;
626 class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
627   NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass,
628   NVPTXRegClass s2_regclass, Intrinsic IntOP, list<Predicate> Preds = []>
629             : NVPTXInst<(outs t_regclass:$dst),
630               (ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2),
631             OpcStr,
632         [(set t_regclass:$dst,
633           (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>,
634           Requires<Preds>;
637 // MISC
// INT_NVVM_PRMT: prmt.b32 with three Int32Regs sources and an Int32Regs
// result, built from F_MATH_3 with no extra predicates; selects int_nvvm_prmt.
// INT_NVVM_NANOSLEEP_{I,R}: nanosleep.u32 with the operand as an immediate
// (_I) or in a 32-bit register (_R); both select int_nvvm_nanosleep and
// require hasPTX<63> and hasSM<70>.
640 def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
641   Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
643 def INT_NVVM_NANOSLEEP_I : NVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32 \t$i;",
644                              [(int_nvvm_nanosleep imm:$i)]>,
645         Requires<[hasPTX<63>, hasSM<70>]>;
646 def INT_NVVM_NANOSLEEP_R : NVPTXInst<(outs), (ins Int32Regs:$i), "nanosleep.u32 \t$i;",
647                              [(int_nvvm_nanosleep Int32Regs:$i)]>,
648         Requires<[hasPTX<63>, hasSM<70>]>;
650 // Min Max
// f32 and f64 min/max. Each def binds one int_nvvm_fmin_* / int_nvvm_fmax_*
// intrinsic to the PTX text in its assembly string via F_MATH_2.
// Predicates as written below:
//   - plain and .ftz forms:        none
//   - .NaN forms:                  hasPTX<70>, hasSM<80>
//   - any .xorsign.abs form:       hasPTX<72>, hasSM<86>
653 def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
654   Float32Regs, Float32Regs, int_nvvm_fmin_f>;
655 def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
656   Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
657 def INT_NVVM_FMIN_NAN_F : F_MATH_2<"min.NaN.f32 \t$dst, $src0, $src1;",
658   Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_nan_f,
659   [hasPTX<70>, hasSM<80>]>;
660 def INT_NVVM_FMIN_FTZ_NAN_F : F_MATH_2<"min.ftz.NaN.f32 \t$dst, $src0, $src1;",
661   Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_nan_f,
662   [hasPTX<70>, hasSM<80>]>;
663 def INT_NVVM_FMIN_XORSIGN_ABS_F :
664   F_MATH_2<"min.xorsign.abs.f32 \t$dst, $src0, $src1;",
665     Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_xorsign_abs_f,
666     [hasPTX<72>, hasSM<86>]>;
667 def INT_NVVM_FMIN_FTZ_XORSIGN_ABS_F :
668   F_MATH_2<"min.ftz.xorsign.abs.f32 \t$dst, $src0, $src1;",
669     Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_xorsign_abs_f,
670     [hasPTX<72>, hasSM<86>]>;
671 def INT_NVVM_FMIN_NAN_XORSIGN_ABS_F :
672   F_MATH_2<"min.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;",
673     Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_nan_xorsign_abs_f,
674     [hasPTX<72>, hasSM<86>]>;
675 def INT_NVVM_FMIN_FTZ_NAN_XORSIGN_ABS_F :
676   F_MATH_2<"min.ftz.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;",
677     Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_nan_xorsign_abs_f,
678     [hasPTX<72>, hasSM<86>]>;
// The max.* defs mirror the min.* defs above one for one.
680 def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs,
681   Float32Regs, Float32Regs, int_nvvm_fmax_f>;
682 def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;",
683   Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>;
684 def INT_NVVM_FMAX_NAN_F : F_MATH_2<"max.NaN.f32 \t$dst, $src0, $src1;",
685   Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_nan_f,
686   [hasPTX<70>, hasSM<80>]>;
687 def INT_NVVM_FMAX_FTZ_NAN_F : F_MATH_2<"max.ftz.NaN.f32 \t$dst, $src0, $src1;",
688   Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_nan_f,
689   [hasPTX<70>, hasSM<80>]>;
690 def INT_NVVM_FMAX_XORSIGN_ABS_F :
691   F_MATH_2<"max.xorsign.abs.f32 \t$dst, $src0, $src1;",
692     Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_xorsign_abs_f,
693     [hasPTX<72>, hasSM<86>]>;
694 def INT_NVVM_FMAX_FTZ_XORSIGN_ABS_F :
695   F_MATH_2<"max.ftz.xorsign.abs.f32 \t$dst, $src0, $src1;",
696     Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_xorsign_abs_f,
697     [hasPTX<72>, hasSM<86>]>;
698 def INT_NVVM_FMAX_NAN_XORSIGN_ABS_F :
699   F_MATH_2<"max.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;",
700     Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_nan_xorsign_abs_f,
701     [hasPTX<72>, hasSM<86>]>;
702 def INT_NVVM_FMAX_FTZ_NAN_XORSIGN_ABS_F :
703   F_MATH_2<"max.ftz.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;",
704     Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_nan_xorsign_abs_f,
705     [hasPTX<72>, hasSM<86>]>;
// f64 has only the plain min/max forms here.
707 def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
708   Float64Regs, Float64Regs, int_nvvm_fmin_d>;
709 def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
710   Float64Regs, Float64Regs, int_nvvm_fmax_d>;
713 // Min Max f16, f16x2, bf16, bf16x2
// MIN_MAX_TUPLE: one row of the table iterated by the MIN_MAX multiclass.
//   V     - variant suffix, stored in Variant (e.g. "_ftz_NaN_f16").
//   I     - intrinsic to match, stored in Intr.
//   RC    - register class for the result and both operands, stored in RegClass.
//   Preds - required predicates, stored in Predicates; defaults to
//           [hasPTX<70>, hasSM<80>] when the row does not give its own list.
// NOTE(review): the closing brace of this class is not present in this
// rendering of the file (the embedded numbering skips 722-723) - confirm it
// exists in the real source.
716 class MIN_MAX_TUPLE<string V, Intrinsic I, NVPTXRegClass RC,
717                     list<Predicate> Preds = [hasPTX<70>, hasSM<80>]> {
718   string Variant = V;
719   Intrinsic Intr = I;
720   NVPTXRegClass RegClass = RC;
721   list<Predicate> Predicates = Preds;
// MIN_MAX<IntName>: emits one F_MATH_2 instruction per MIN_MAX_TUPLE row below.
// IntName is compared against "min" in every row: when it equals "min" the
// int_nvvm_fmin_* intrinsic is chosen, otherwise the int_nvvm_fmax_* one.
// Rows without an explicit predicate list use MIN_MAX_TUPLE's default
// ([hasPTX<70>, hasSM<80>]); every xorsign_abs row passes
// [hasPTX<72>, hasSM<86>].
// Register classes as written: f16 and bf16 rows use Int16Regs, f16x2 and
// bf16x2 rows use Int32Regs.
// NOTE(review): the closing brace of this multiclass is not present in this
// rendering of the file (the embedded numbering skips 792-793).
724 multiclass MIN_MAX<string IntName> {
725   foreach P = [
726     MIN_MAX_TUPLE<"_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_f16,
727       int_nvvm_fmax_f16), Int16Regs>,
728     MIN_MAX_TUPLE<"_ftz_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_ftz_f16,
729       int_nvvm_fmax_ftz_f16), Int16Regs>,
730     MIN_MAX_TUPLE<"_NaN_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_f16,
731       int_nvvm_fmax_nan_f16), Int16Regs>,
732     MIN_MAX_TUPLE<"_ftz_NaN_f16", !if(!eq(IntName, "min"),
733       int_nvvm_fmin_ftz_nan_f16, int_nvvm_fmax_ftz_nan_f16), Int16Regs>,
734     MIN_MAX_TUPLE<"_xorsign_abs_f16", !if(!eq(IntName, "min"),
735       int_nvvm_fmin_xorsign_abs_f16, int_nvvm_fmax_xorsign_abs_f16),
736       Int16Regs, [hasPTX<72>, hasSM<86>]>,
737     MIN_MAX_TUPLE<"_ftz_xorsign_abs_f16", !if(!eq(IntName, "min"),
738       int_nvvm_fmin_ftz_xorsign_abs_f16, int_nvvm_fmax_ftz_xorsign_abs_f16),
739       Int16Regs, [hasPTX<72>, hasSM<86>]>,
740     MIN_MAX_TUPLE<"_NaN_xorsign_abs_f16", !if(!eq(IntName, "min"),
741       int_nvvm_fmin_nan_xorsign_abs_f16, int_nvvm_fmax_nan_xorsign_abs_f16),
742       Int16Regs, [hasPTX<72>, hasSM<86>]>,
743     MIN_MAX_TUPLE<"_ftz_NaN_xorsign_abs_f16", !if(!eq(IntName, "min"),
744       int_nvvm_fmin_ftz_nan_xorsign_abs_f16,
745       int_nvvm_fmax_ftz_nan_xorsign_abs_f16), Int16Regs, [hasPTX<72>, hasSM<86>]>,
746     MIN_MAX_TUPLE<"_f16x2", !if(!eq(IntName, "min"), int_nvvm_fmin_f16x2,
747       int_nvvm_fmax_f16x2), Int32Regs>,
748     MIN_MAX_TUPLE<"_ftz_f16x2", !if(!eq(IntName, "min"),
749       int_nvvm_fmin_ftz_f16x2, int_nvvm_fmax_ftz_f16x2), Int32Regs>,
750     MIN_MAX_TUPLE<"_NaN_f16x2", !if(!eq(IntName, "min"),
751       int_nvvm_fmin_nan_f16x2, int_nvvm_fmax_nan_f16x2), Int32Regs>,
752     MIN_MAX_TUPLE<"_ftz_NaN_f16x2", !if(!eq(IntName, "min"),
753       int_nvvm_fmin_ftz_nan_f16x2, int_nvvm_fmax_ftz_nan_f16x2), Int32Regs>,
754     MIN_MAX_TUPLE<"_xorsign_abs_f16x2", !if(!eq(IntName, "min"),
755       int_nvvm_fmin_xorsign_abs_f16x2, int_nvvm_fmax_xorsign_abs_f16x2),
756       Int32Regs, [hasPTX<72>, hasSM<86>]>,
757     MIN_MAX_TUPLE<"_ftz_xorsign_abs_f16x2", !if(!eq(IntName, "min"),
758       int_nvvm_fmin_ftz_xorsign_abs_f16x2, int_nvvm_fmax_ftz_xorsign_abs_f16x2),
759       Int32Regs, [hasPTX<72>, hasSM<86>]>,
760     MIN_MAX_TUPLE<"_NaN_xorsign_abs_f16x2", !if(!eq(IntName, "min"),
761       int_nvvm_fmin_nan_xorsign_abs_f16x2, int_nvvm_fmax_nan_xorsign_abs_f16x2),
762       Int32Regs, [hasPTX<72>, hasSM<86>]>,
763     MIN_MAX_TUPLE<"_ftz_NaN_xorsign_abs_f16x2", !if(!eq(IntName, "min"),
764       int_nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
765       int_nvvm_fmax_ftz_nan_xorsign_abs_f16x2),
766       Int32Regs, [hasPTX<72>, hasSM<86>]>,
767     MIN_MAX_TUPLE<"_bf16", !if(!eq(IntName, "min"),
768       int_nvvm_fmin_bf16, int_nvvm_fmax_bf16), Int16Regs>,
769     MIN_MAX_TUPLE<"_NaN_bf16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_bf16,
770       int_nvvm_fmax_nan_bf16), Int16Regs>,
771     MIN_MAX_TUPLE<"_xorsign_abs_bf16", !if(!eq(IntName, "min"),
772       int_nvvm_fmin_xorsign_abs_bf16, int_nvvm_fmax_xorsign_abs_bf16),
773       Int16Regs, [hasPTX<72>, hasSM<86>]>,
774     MIN_MAX_TUPLE<"_NaN_xorsign_abs_bf16", !if(!eq(IntName, "min"),
775       int_nvvm_fmin_nan_xorsign_abs_bf16, int_nvvm_fmax_nan_xorsign_abs_bf16),
776       Int16Regs, [hasPTX<72>, hasSM<86>]>,
777     MIN_MAX_TUPLE<"_bf16x2", !if(!eq(IntName, "min"), int_nvvm_fmin_bf16x2,
778       int_nvvm_fmax_bf16x2), Int32Regs>,
779     MIN_MAX_TUPLE<"_NaN_bf16x2", !if(!eq(IntName, "min"),
780       int_nvvm_fmin_nan_bf16x2, int_nvvm_fmax_nan_bf16x2), Int32Regs>,
781     MIN_MAX_TUPLE<"_xorsign_abs_bf16x2", !if(!eq(IntName, "min"),
782       int_nvvm_fmin_xorsign_abs_bf16x2, int_nvvm_fmax_xorsign_abs_bf16x2),
783       Int32Regs, [hasPTX<72>, hasSM<86>]>,
784     MIN_MAX_TUPLE<"_NaN_xorsign_abs_bf16x2", !if(!eq(IntName, "min"),
785       int_nvvm_fmin_nan_xorsign_abs_bf16x2,
786       int_nvvm_fmax_nan_xorsign_abs_bf16x2),
787       Int32Regs, [hasPTX<72>, hasSM<86>]>] in {
        // The assembly string is IntName, then the variant suffix with every
        // "_" replaced by ".", then the operand list; e.g. IntName "min" and
        // Variant "_ftz_NaN_f16" give "min.ftz.NaN.f16 \t$dst, $src0, $src1;".
788         def P.Variant : F_MATH_2<!strconcat(
789           IntName, !subst("_", ".", P.Variant), " \t$dst, $src0, $src1;"),
790           P.RegClass, P.RegClass, P.RegClass, P.Intr, P.Predicates>;
791   }
// Instantiate the f16/f16x2/bf16/bf16x2 table once for "min" and once for
// "max".
// NOTE(review): the prefix "INT_NVVM_FMAN" on the "max" instantiation looks
// like a typo for "INT_NVVM_FMAX". Renaming it would change the names of the
// generated records, so confirm nothing refers to them before fixing it.
794 defm INT_NVVM_FMIN : MIN_MAX<"min">;
795 defm INT_NVVM_FMAN : MIN_MAX<"max">;
798 // Multiplication
// mul.hi: one def per int_nvvm_mulhi_* intrinsic. The defs below use Int16Regs
// for the .s16/.u16 forms, Int32Regs for .s32/.u32 and Int64Regs for
// .s64/.u64.
801 def INT_NVVM_MULHI_S : F_MATH_2<"mul.hi.s16 \t$dst, $src0, $src1;", Int16Regs,
802   Int16Regs, Int16Regs, int_nvvm_mulhi_s>;
803 def INT_NVVM_MULHI_US : F_MATH_2<"mul.hi.u16 \t$dst, $src0, $src1;", Int16Regs,
804   Int16Regs, Int16Regs, int_nvvm_mulhi_us>;
805 def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs,
806   Int32Regs, Int32Regs, int_nvvm_mulhi_i>;
807 def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs,
808   Int32Regs, Int32Regs, int_nvvm_mulhi_ui>;
809 def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs,
810   Int64Regs, Int64Regs, int_nvvm_mulhi_ll>;
811 def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs,
812   Int64Regs, Int64Regs, int_nvvm_mulhi_ull>;
// f32 multiply: the rn/rz/rm/rp modifiers each appear with and without .ftz.
814 def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;",
815   Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>;
816 def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;",
817   Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>;
818 def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;",
819   Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>;
820 def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;",
821   Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>;
822 def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;",
823   Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>;
824 def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;",
825   Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>;
826 def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;",
827   Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>;
828 def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;",
829   Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>;
// f64 multiply: rn/rz/rm/rp only; no .ftz forms are defined here.
831 def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;",
832   Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>;
833 def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;",
834   Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>;
835 def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;",
836   Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>;
837 def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;",
838   Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>;
// mul24.lo on Int32Regs, in the .s32 and .u32 forms.
840 def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;",
841   Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>;
842 def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;",
843   Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>;
846 // Div
// Each def binds one int_nvvm_div_* intrinsic to the PTX text in its assembly
// string. f32 has .approx and rn/rz/rm/rp forms, each with and without .ftz;
// f64 has rn/rz/rm/rp only.
849 def INT_NVVM_DIV_APPROX_FTZ_F
850   : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs,
851     Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>;
852 def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;",
853   Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>;
855 def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;",
856   Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>;
857 def INT_NVVM_DIV_RN_F     : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;",
858   Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>;
859 def INT_NVVM_DIV_RZ_FTZ_F : F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;",
860   Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>;
861 def INT_NVVM_DIV_RZ_F     : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;",
862   Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>;
863 def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;",
864   Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>;
865 def INT_NVVM_DIV_RM_F     : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;",
866   Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>;
867 def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;",
868   Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>;
869 def INT_NVVM_DIV_RP_F     : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;",
870   Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>;
872 def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;",
873   Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>;
874 def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;",
875   Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>;
876 def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;",
877   Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>;
878 def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
879   Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
882 // Sad
// Three-operand "sad" instructions, one per int_nvvm_sad_* intrinsic. Result
// and all three inputs share one register class per def: Int16Regs for
// .s16/.u16, Int32Regs for .s32/.u32, Int64Regs for .s64/.u64.
885 def INT_NVVM_SAD_S : F_MATH_3<"sad.s16 \t$dst, $src0, $src1, $src2;",
886   Int16Regs, Int16Regs, Int16Regs, Int16Regs, int_nvvm_sad_s>;
887 def INT_NVVM_SAD_US : F_MATH_3<"sad.u16 \t$dst, $src0, $src1, $src2;",
888   Int16Regs, Int16Regs, Int16Regs, Int16Regs, int_nvvm_sad_us>;
889 def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;",
890   Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>;
891 def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
892   Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>;
893 def INT_NVVM_SAD_LL : F_MATH_3<"sad.s64 \t$dst, $src0, $src1, $src2;",
894   Int64Regs, Int64Regs, Int64Regs, Int64Regs, int_nvvm_sad_ll>;
895 def INT_NVVM_SAD_ULL : F_MATH_3<"sad.u64 \t$dst, $src0, $src1, $src2;",
896   Int64Regs, Int64Regs, Int64Regs, Int64Regs, int_nvvm_sad_ull>;
899 // Floor  Ceil
// No dedicated instructions are defined for floor/ceil: each intrinsic is
// rewritten to a same-type CVT (CVT_f32_f32 or CVT_f64_f64) whose second
// operand selects the conversion mode. floor uses CvtRMI / CvtRMI_FTZ and ceil
// uses CvtRPI / CvtRPI_FTZ.
// NOTE(review): the Cvt* mode operands are defined outside this chunk;
// presumably RMI/RPI round to an integral value toward -inf/+inf - confirm.
902 def : Pat<(int_nvvm_floor_ftz_f Float32Regs:$a),
903           (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
904 def : Pat<(int_nvvm_floor_f Float32Regs:$a),
905           (CVT_f32_f32 Float32Regs:$a, CvtRMI)>;
906 def : Pat<(int_nvvm_floor_d Float64Regs:$a),
907           (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
909 def : Pat<(int_nvvm_ceil_ftz_f Float32Regs:$a),
910           (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
911 def : Pat<(int_nvvm_ceil_f Float32Regs:$a),
912           (CVT_f32_f32 Float32Regs:$a, CvtRPI)>;
913 def : Pat<(int_nvvm_ceil_d Float64Regs:$a),
914           (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
917 // Abs
// One-operand "abs" for f32 (with and without .ftz) and f64, bound to the
// int_nvvm_fabs_* intrinsics via F_MATH_1.
920 def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
921   Float32Regs, int_nvvm_fabs_ftz_f>;
922 def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
923   Float32Regs, int_nvvm_fabs_f>;
925 def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
926   Float64Regs, int_nvvm_fabs_d>;
929 // Abs, Neg bf16, bf16x2
// abs/neg for bf16 (Int16Regs) and bf16x2 (Int32Regs). All four defs are gated
// on hasPTX<70> and hasSM<80>.
932 def INT_NVVM_ABS_BF16 : F_MATH_1<"abs.bf16 \t$dst, $src0;", Int16Regs,
933   Int16Regs, int_nvvm_abs_bf16, [hasPTX<70>, hasSM<80>]>;
934 def INT_NVVM_ABS_BF16X2 : F_MATH_1<"abs.bf16x2 \t$dst, $src0;", Int32Regs,
935   Int32Regs, int_nvvm_abs_bf16x2, [hasPTX<70>, hasSM<80>]>;
936 def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16 \t$dst, $src0;", Int16Regs,
937   Int16Regs, int_nvvm_neg_bf16, [hasPTX<70>, hasSM<80>]>;
938 def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", Int32Regs,
939   Int32Regs, int_nvvm_neg_bf16x2, [hasPTX<70>, hasSM<80>]>;
942 // Round
// int_nvvm_round_* is rewritten to a same-type CVT with mode CvtRNI
// (CvtRNI_FTZ for the ftz intrinsic).
// NOTE(review): CvtRNI is defined outside this chunk; presumably it rounds to
// the nearest integral value - confirm.
945 def : Pat<(int_nvvm_round_ftz_f Float32Regs:$a),
946           (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
947 def : Pat<(int_nvvm_round_f Float32Regs:$a),
948           (CVT_f32_f32 Float32Regs:$a, CvtRNI)>;
949 def : Pat<(int_nvvm_round_d Float64Regs:$a),
950           (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
953 // Trunc
// int_nvvm_trunc_* is rewritten to a same-type CVT with mode CvtRZI
// (CvtRZI_FTZ for the ftz intrinsic).
// NOTE(review): CvtRZI is defined outside this chunk; presumably it rounds to
// an integral value toward zero - confirm.
956 def : Pat<(int_nvvm_trunc_ftz_f Float32Regs:$a),
957           (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
958 def : Pat<(int_nvvm_trunc_f Float32Regs:$a),
959           (CVT_f32_f32 Float32Regs:$a, CvtRZI)>;
960 def : Pat<(int_nvvm_trunc_d Float64Regs:$a),
961           (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
964 // Saturate
// int_nvvm_saturate_* is rewritten to a same-type CVT with mode CvtSAT
// (CvtSAT_FTZ for the ftz intrinsic). The mode operands are defined outside
// this chunk.
967 def : Pat<(int_nvvm_saturate_ftz_f Float32Regs:$a),
968           (CVT_f32_f32 Float32Regs:$a, CvtSAT_FTZ)>;
969 def : Pat<(int_nvvm_saturate_f Float32Regs:$a),
970           (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
971 def : Pat<(int_nvvm_saturate_d Float64Regs:$a),
972           (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
975 // Exp2  Log2
// One-operand ex2.approx / lg2.approx instructions bound to the
// int_nvvm_ex2_approx_* and int_nvvm_lg2_approx_* intrinsics. Only the f16 and
// f16x2 ex2 forms carry predicates (hasPTX<70>, hasSM<75>).
// NOTE(review): the f64 defs print "ex2.approx.f64" / "lg2.approx.f64";
// confirm against the PTX ISA that a .f64 type is accepted for these
// instructions.
978 def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;",
979   Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>;
980 def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
981   Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>;
982 def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
983   Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
984 def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;",
985   Int16Regs, Int16Regs, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>;
986 def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;",
987   Int32Regs, Int32Regs, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>;
989 def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
990   Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
991 def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
992   Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>;
993 def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
994   Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>;
997 // Sin  Cos
// sin.approx / cos.approx for f32 only, each with and without .ftz.
1000 def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;",
1001   Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>;
1002 def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;",
1003   Float32Regs, Float32Regs, int_nvvm_sin_approx_f>;
1005 def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
1006   Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>;
1007 def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
1008   Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;
1011 // Fma
// FMA_TUPLE: one row of the table iterated by the FMA_INST multiclass.
//   V     - variant suffix, stored in Variant (e.g. "_rn_ftz_f32").
//   I     - intrinsic to match, stored in Intr.
//   RC    - register class for the result and all three operands, stored in
//           RegClass.
//   Preds - required predicates, stored in Predicates (defaults to none).
// NOTE(review): the closing brace of this class is not present in this
// rendering of the file (the embedded numbering skips 1020-1021).
1014 class FMA_TUPLE<string V, Intrinsic I, NVPTXRegClass RC,
1015                 list<Predicate> Preds = []> {
1016   string Variant = V;
1017   Intrinsic Intr = I;
1018   NVPTXRegClass RegClass = RC;
1019   list<Predicate> Predicates = Preds;
// FMA_INST: emits one F_MATH_3 instruction per FMA_TUPLE row below.
// Predicates as written in the rows:
//   - f64 and f32 rows:                         none
//   - f16 / f16x2 plain, ftz and sat rows:      hasPTX<42>, hasSM<53>
//   - f16 / f16x2 relu rows, all bf16 / bf16x2: hasPTX<70>, hasSM<80>
// NOTE(review): the bf16 rows include ftz and sat variants ("_rn_ftz_bf16",
// "_rn_sat_bf16", "_rn_ftz_sat_bf16", "_rn_ftz_relu_bf16"); confirm against the
// PTX ISA that fma accepts .ftz/.sat with .bf16, since these rows would emit
// e.g. "fma.rn.ftz.bf16".
// NOTE(review): the closing brace of this multiclass is not present in this
// rendering of the file (the embedded numbering skips 1084-1085).
1022 multiclass FMA_INST {
1023   foreach P = [
1024     FMA_TUPLE<"_rn_f64", int_nvvm_fma_rn_d, Float64Regs>,
1025     FMA_TUPLE<"_rz_f64", int_nvvm_fma_rz_d, Float64Regs>,
1026     FMA_TUPLE<"_rm_f64", int_nvvm_fma_rm_d, Float64Regs>,
1027     FMA_TUPLE<"_rp_f64", int_nvvm_fma_rp_d, Float64Regs>,
1029     FMA_TUPLE<"_rn_ftz_f32", int_nvvm_fma_rn_ftz_f, Float32Regs>,
1030     FMA_TUPLE<"_rn_f32", int_nvvm_fma_rn_f, Float32Regs>,
1031     FMA_TUPLE<"_rz_ftz_f32", int_nvvm_fma_rz_ftz_f, Float32Regs>,
1032     FMA_TUPLE<"_rz_f32", int_nvvm_fma_rz_f, Float32Regs>,
1033     FMA_TUPLE<"_rm_f32", int_nvvm_fma_rm_f, Float32Regs>,
1034     FMA_TUPLE<"_rm_ftz_f32", int_nvvm_fma_rm_ftz_f, Float32Regs>,
1035     FMA_TUPLE<"_rp_f32", int_nvvm_fma_rp_f, Float32Regs>,
1036     FMA_TUPLE<"_rp_ftz_f32", int_nvvm_fma_rp_ftz_f, Float32Regs>,
1038     FMA_TUPLE<"_rn_f16", int_nvvm_fma_rn_f16, Int16Regs, [hasPTX<42>, hasSM<53>]>,
1039     FMA_TUPLE<"_rn_ftz_f16", int_nvvm_fma_rn_ftz_f16, Int16Regs,
1040       [hasPTX<42>, hasSM<53>]>,
1041     FMA_TUPLE<"_rn_sat_f16", int_nvvm_fma_rn_sat_f16, Int16Regs,
1042       [hasPTX<42>, hasSM<53>]>,
1043     FMA_TUPLE<"_rn_ftz_sat_f16", int_nvvm_fma_rn_ftz_sat_f16, Int16Regs,
1044       [hasPTX<42>, hasSM<53>]>,
1045     FMA_TUPLE<"_rn_relu_f16", int_nvvm_fma_rn_relu_f16, Int16Regs,
1046       [hasPTX<70>, hasSM<80>]>,
1047     FMA_TUPLE<"_rn_ftz_relu_f16", int_nvvm_fma_rn_ftz_relu_f16, Int16Regs,
1048       [hasPTX<70>, hasSM<80>]>,
1050     FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, Int16Regs, [hasPTX<70>, hasSM<80>]>,
1051     FMA_TUPLE<"_rn_ftz_bf16", int_nvvm_fma_rn_ftz_bf16, Int16Regs,
1052       [hasPTX<70>, hasSM<80>]>,
1053     FMA_TUPLE<"_rn_sat_bf16", int_nvvm_fma_rn_sat_bf16, Int16Regs,
1054       [hasPTX<70>, hasSM<80>]>,
1055     FMA_TUPLE<"_rn_ftz_sat_bf16", int_nvvm_fma_rn_ftz_sat_bf16, Int16Regs,
1056       [hasPTX<70>, hasSM<80>]>,
1057     FMA_TUPLE<"_rn_relu_bf16", int_nvvm_fma_rn_relu_bf16, Int16Regs,
1058       [hasPTX<70>, hasSM<80>]>,
1059     FMA_TUPLE<"_rn_ftz_relu_bf16", int_nvvm_fma_rn_ftz_relu_bf16, Int16Regs,
1060       [hasPTX<70>, hasSM<80>]>,
1062     FMA_TUPLE<"_rn_f16x2", int_nvvm_fma_rn_f16x2, Int32Regs,
1063       [hasPTX<42>, hasSM<53>]>,
1064     FMA_TUPLE<"_rn_ftz_f16x2", int_nvvm_fma_rn_ftz_f16x2, Int32Regs,
1065       [hasPTX<42>, hasSM<53>]>,
1066     FMA_TUPLE<"_rn_sat_f16x2", int_nvvm_fma_rn_sat_f16x2, Int32Regs,
1067       [hasPTX<42>, hasSM<53>]>,
1068     FMA_TUPLE<"_rn_ftz_sat_f16x2", int_nvvm_fma_rn_ftz_sat_f16x2,
1069       Int32Regs, [hasPTX<42>, hasSM<53>]>,
1070     FMA_TUPLE<"_rn_relu_f16x2", int_nvvm_fma_rn_relu_f16x2, Int32Regs,
1071       [hasPTX<70>, hasSM<80>]>,
1072     FMA_TUPLE<"_rn_ftz_relu_f16x2", int_nvvm_fma_rn_ftz_relu_f16x2,
1073       Int32Regs, [hasPTX<70>, hasSM<80>]>,
1074     FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, Int32Regs,
1075       [hasPTX<70>, hasSM<80>]>,
1076     FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, Int32Regs,
1077       [hasPTX<70>, hasSM<80>]>
1078   ] in {
    // The assembly string is "fma", then the variant suffix with every "_"
    // replaced by ".", then the operand list; e.g. Variant "_rn_ftz_f32" gives
    // "fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;".
1079     def P.Variant :
1080       F_MATH_3<!strconcat("fma",
1081         !subst("_", ".", P.Variant), " \t$dst, $src0, $src1, $src2;"),
1082         P.RegClass, P.RegClass, P.RegClass, P.RegClass, P.Intr, P.Predicates>;
1083   }
// Instantiate every FMA_INST row under the INT_NVVM_FMA prefix.
1086 defm INT_NVVM_FMA : FMA_INST;
1089 // Rcp
// One-operand "rcp" instructions bound to the int_nvvm_rcp_* intrinsics.
// f32 has rn/rz/rm/rp forms, each with and without .ftz; f64 has rn/rz/rm/rp.
1092 def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;",
1093   Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>;
1094 def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;",
1095   Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>;
1096 def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;",
1097   Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>;
1098 def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;",
1099   Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>;
1100 def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;",
1101   Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>;
1102 def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;",
1103   Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>;
1104 def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;",
1105   Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>;
1106 def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;",
1107   Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>;
1109 def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs,
1110   Float64Regs, int_nvvm_rcp_rn_d>;
1111 def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs,
1112   Float64Regs, int_nvvm_rcp_rz_d>;
1113 def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs,
1114   Float64Regs, int_nvvm_rcp_rm_d>;
1115 def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs,
1116   Float64Regs, int_nvvm_rcp_rp_d>;
// Approximate reciprocal: only the .ftz forms are defined here, for f32 and
// f64.
1118 def INT_NVVM_RCP_APPROX_FTZ_F : F_MATH_1<"rcp.approx.ftz.f32 \t$dst, $src0;",
1119   Float32Regs, Float32Regs, int_nvvm_rcp_approx_ftz_f>;
1120 def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;",
1121   Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>;
1124 // Sqrt
// One-operand "sqrt" instructions bound to the int_nvvm_sqrt_* intrinsics.
// f32 has rn/rz/rm/rp and approx forms, each with and without .ftz; f64 has
// rn/rz/rm/rp.
1127 def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;",
1128   Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>;
1129 def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs,
1130   Float32Regs, int_nvvm_sqrt_rn_f>;
1131 def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;",
1132   Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>;
1133 def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs,
1134   Float32Regs, int_nvvm_sqrt_rz_f>;
1135 def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;",
1136   Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>;
1137 def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs,
1138   Float32Regs, int_nvvm_sqrt_rm_f>;
1139 def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;",
1140   Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>;
1141 def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs,
1142   Float32Regs, int_nvvm_sqrt_rp_f>;
1143 def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;",
1144   Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>;
1145 def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;",
1146   Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>;
1148 def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs,
1149   Float64Regs, int_nvvm_sqrt_rn_d>;
1150 def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs,
1151   Float64Regs, int_nvvm_sqrt_rz_d>;
1152 def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs,
1153   Float64Regs, int_nvvm_sqrt_rm_d>;
1154 def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
1155   Float64Regs, int_nvvm_sqrt_rp_d>;
1157 // nvvm_sqrt intrinsic
// int_nvvm_sqrt_f has no instruction of its own; the four patterns below pick
// one of the f32 sqrt instructions above according to their Requires<> lists:
//   [doF32FTZ, do_SQRTF32_RN] -> INT_NVVM_SQRT_RN_FTZ_F
//   [do_SQRTF32_RN]           -> INT_NVVM_SQRT_RN_F
//   [doF32FTZ]                -> INT_NVVM_SQRT_APPROX_FTZ_F
//   (no predicates)           -> INT_NVVM_SQRT_APPROX_F
// NOTE(review): how overlapping matches are prioritised is not visible here;
// presumably the more constrained pattern is chosen when its predicates
// hold - confirm.
1158 def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
1159           (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
1160 def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
1161           (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>;
1162 def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
1163           (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
1164 def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
1165           (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>;
1168 // Rsqrt
// rsqrt.approx for f32 and f64, each with and without .ftz, bound to the
// int_nvvm_rsqrt_approx_* intrinsics.
1171 def INT_NVVM_RSQRT_APPROX_FTZ_F
1172   : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs,
1173     int_nvvm_rsqrt_approx_ftz_f>;
1174 def INT_NVVM_RSQRT_APPROX_FTZ_D
1175   : F_MATH_1<"rsqrt.approx.ftz.f64 \t$dst, $src0;", Float64Regs, Float64Regs,
1176     int_nvvm_rsqrt_approx_ftz_d>;
1178 def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;",
1179   Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>;
1180 def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
1181   Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>;
// The patterns below fold an fdiv of FloatConst1 by an f32 square root into a
// single rsqrt.approx instruction. All of them require doRsqrtOpt.
1183 // 1.0f / sqrt_approx -> rsqrt_approx
1184 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_f Float32Regs:$a)),
1185          (INT_NVVM_RSQRT_APPROX_F Float32Regs:$a)>,
1186          Requires<[doRsqrtOpt]>;
1187 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_ftz_f Float32Regs:$a)),
1188          (INT_NVVM_RSQRT_APPROX_FTZ_F Float32Regs:$a)>,
1189          Requires<[doRsqrtOpt]>;
1190 // Same for int_nvvm_sqrt_f when an approximate (non-precise) sqrt is requested.
// doNoF32FTZ selects the plain form, doF32FTZ the .ftz form.
1191 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$a)),
1192          (INT_NVVM_RSQRT_APPROX_F Float32Regs:$a)>,
1193          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
1194 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$a)),
1195          (INT_NVVM_RSQRT_APPROX_FTZ_F Float32Regs:$a)>,
1196          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
// The same fold applied to the generic fsqrt node, under the same predicates.
1198 def: Pat<(fdiv FloatConst1, (fsqrt Float32Regs:$a)),
1199          (INT_NVVM_RSQRT_APPROX_F Float32Regs:$a)>,
1200          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
1201 def: Pat<(fdiv FloatConst1, (fsqrt Float32Regs:$a)),
1202          (INT_NVVM_RSQRT_APPROX_FTZ_F Float32Regs:$a)>,
1203          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
1205 // Add
// Two-operand "add" instructions bound to the int_nvvm_add_* intrinsics.
// f32 has rn/rz/rm/rp forms, each with and without .ftz; f64 has rn/rz/rm/rp.
1208 def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;",
1209   Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>;
1210 def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;",
1211   Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>;
1212 def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;",
1213   Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>;
1214 def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;",
1215   Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>;
1216 def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;",
1217   Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>;
1218 def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;",
1219   Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>;
1220 def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;",
1221   Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>;
1222 def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;",
1223   Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>;
1225 def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;",
1226   Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>;
1227 def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;",
1228   Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>;
1229 def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
1230   Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>;
1231 def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
1232   Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
1235 // Convert
// The conversion intrinsics have no instructions of their own: each one is
// rewritten to a CVT_<dst>_<src> instruction whose second operand names the
// conversion mode. The Cvt* mode operands are defined outside this chunk.
//
// f64 -> f32 (int_nvvm_d2f_*): CVT_f32_f64 with CvtRN/RZ/RM/RP, or the _FTZ
// mode for the ftz intrinsics.
1238 def : Pat<(int_nvvm_d2f_rn_ftz Float64Regs:$a),
1239           (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>;
1240 def : Pat<(int_nvvm_d2f_rn Float64Regs:$a),
1241           (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
1242 def : Pat<(int_nvvm_d2f_rz_ftz Float64Regs:$a),
1243           (CVT_f32_f64 Float64Regs:$a, CvtRZ_FTZ)>;
1244 def : Pat<(int_nvvm_d2f_rz Float64Regs:$a),
1245           (CVT_f32_f64 Float64Regs:$a, CvtRZ)>;
1246 def : Pat<(int_nvvm_d2f_rm_ftz Float64Regs:$a),
1247           (CVT_f32_f64 Float64Regs:$a, CvtRM_FTZ)>;
1248 def : Pat<(int_nvvm_d2f_rm Float64Regs:$a),
1249           (CVT_f32_f64 Float64Regs:$a, CvtRM)>;
1250 def : Pat<(int_nvvm_d2f_rp_ftz Float64Regs:$a),
1251           (CVT_f32_f64 Float64Regs:$a, CvtRP_FTZ)>;
1252 def : Pat<(int_nvvm_d2f_rp Float64Regs:$a),
1253           (CVT_f32_f64 Float64Regs:$a, CvtRP)>;
// f64 -> 32-bit integer. int_nvvm_d2i_* maps to CVT_s32_f64 and
// int_nvvm_d2ui_* to CVT_u32_f64; both use the CvtRNI/RZI/RMI/RPI modes.
1255 def : Pat<(int_nvvm_d2i_rn Float64Regs:$a),
1256           (CVT_s32_f64 Float64Regs:$a, CvtRNI)>;
1257 def : Pat<(int_nvvm_d2i_rz Float64Regs:$a),
1258           (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
1259 def : Pat<(int_nvvm_d2i_rm Float64Regs:$a),
1260           (CVT_s32_f64 Float64Regs:$a, CvtRMI)>;
1261 def : Pat<(int_nvvm_d2i_rp Float64Regs:$a),
1262           (CVT_s32_f64 Float64Regs:$a, CvtRPI)>;
1264 def : Pat<(int_nvvm_d2ui_rn Float64Regs:$a),
1265           (CVT_u32_f64 Float64Regs:$a, CvtRNI)>;
1266 def : Pat<(int_nvvm_d2ui_rz Float64Regs:$a),
1267           (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
1268 def : Pat<(int_nvvm_d2ui_rm Float64Regs:$a),
1269           (CVT_u32_f64 Float64Regs:$a, CvtRMI)>;
1270 def : Pat<(int_nvvm_d2ui_rp Float64Regs:$a),
1271           (CVT_u32_f64 Float64Regs:$a, CvtRPI)>;
// 32-bit integer -> f64. int_nvvm_i2d_* maps to CVT_f64_s32 and
// int_nvvm_ui2d_* to CVT_f64_u32; both use the CvtRN/RZ/RM/RP modes.
1273 def : Pat<(int_nvvm_i2d_rn Int32Regs:$a),
1274           (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
1275 def : Pat<(int_nvvm_i2d_rz Int32Regs:$a),
1276           (CVT_f64_s32 Int32Regs:$a, CvtRZ)>;
1277 def : Pat<(int_nvvm_i2d_rm Int32Regs:$a),
1278           (CVT_f64_s32 Int32Regs:$a, CvtRM)>;
1279 def : Pat<(int_nvvm_i2d_rp Int32Regs:$a),
1280           (CVT_f64_s32 Int32Regs:$a, CvtRP)>;
1282 def : Pat<(int_nvvm_ui2d_rn Int32Regs:$a),
1283           (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
1284 def : Pat<(int_nvvm_ui2d_rz Int32Regs:$a),
1285           (CVT_f64_u32 Int32Regs:$a, CvtRZ)>;
1286 def : Pat<(int_nvvm_ui2d_rm Int32Regs:$a),
1287           (CVT_f64_u32 Int32Regs:$a, CvtRM)>;
1288 def : Pat<(int_nvvm_ui2d_rp Int32Regs:$a),
1289           (CVT_f64_u32 Int32Regs:$a, CvtRP)>;
// f32 -> 32-bit integer. int_nvvm_f2i_* maps to CVT_s32_f32 and
// int_nvvm_f2ui_* to CVT_u32_f32. Each uses the CvtRNI/RZI/RMI/RPI modes, with
// the _FTZ mode for the ftz intrinsics.
1291 def : Pat<(int_nvvm_f2i_rn_ftz Float32Regs:$a),
1292           (CVT_s32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
1293 def : Pat<(int_nvvm_f2i_rn Float32Regs:$a),
1294           (CVT_s32_f32 Float32Regs:$a, CvtRNI)>;
1295 def : Pat<(int_nvvm_f2i_rz_ftz Float32Regs:$a),
1296           (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
1297 def : Pat<(int_nvvm_f2i_rz Float32Regs:$a),
1298           (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
1299 def : Pat<(int_nvvm_f2i_rm_ftz Float32Regs:$a),
1300           (CVT_s32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
1301 def : Pat<(int_nvvm_f2i_rm Float32Regs:$a),
1302           (CVT_s32_f32 Float32Regs:$a, CvtRMI)>;
1303 def : Pat<(int_nvvm_f2i_rp_ftz Float32Regs:$a),
1304           (CVT_s32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
1305 def : Pat<(int_nvvm_f2i_rp Float32Regs:$a),
1306           (CVT_s32_f32 Float32Regs:$a, CvtRPI)>;
1308 def : Pat<(int_nvvm_f2ui_rn_ftz Float32Regs:$a),
1309           (CVT_u32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
1310 def : Pat<(int_nvvm_f2ui_rn Float32Regs:$a),
1311           (CVT_u32_f32 Float32Regs:$a, CvtRNI)>;
1312 def : Pat<(int_nvvm_f2ui_rz_ftz Float32Regs:$a),
1313           (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
1314 def : Pat<(int_nvvm_f2ui_rz Float32Regs:$a),
1315           (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
1316 def : Pat<(int_nvvm_f2ui_rm_ftz Float32Regs:$a),
1317           (CVT_u32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
1318 def : Pat<(int_nvvm_f2ui_rm Float32Regs:$a),
1319           (CVT_u32_f32 Float32Regs:$a, CvtRMI)>;
1320 def : Pat<(int_nvvm_f2ui_rp_ftz Float32Regs:$a),
1321           (CVT_u32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
1322 def : Pat<(int_nvvm_f2ui_rp Float32Regs:$a),
1323           (CVT_u32_f32 Float32Regs:$a, CvtRPI)>;
1325 def : Pat<(int_nvvm_i2f_rn Int32Regs:$a),
1326           (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
1327 def : Pat<(int_nvvm_i2f_rz Int32Regs:$a),
1328           (CVT_f32_s32 Int32Regs:$a, CvtRZ)>;
1329 def : Pat<(int_nvvm_i2f_rm Int32Regs:$a),
1330           (CVT_f32_s32 Int32Regs:$a, CvtRM)>;
1331 def : Pat<(int_nvvm_i2f_rp Int32Regs:$a),
1332           (CVT_f32_s32 Int32Regs:$a, CvtRP)>;
1334 def : Pat<(int_nvvm_ui2f_rn Int32Regs:$a),
1335           (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
1336 def : Pat<(int_nvvm_ui2f_rz Int32Regs:$a),
1337           (CVT_f32_u32 Int32Regs:$a, CvtRZ)>;
1338 def : Pat<(int_nvvm_ui2f_rm Int32Regs:$a),
1339           (CVT_f32_u32 Int32Regs:$a, CvtRM)>;
1340 def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a),
1341           (CVT_f32_u32 Int32Regs:$a, CvtRP)>;
// Two f32 inputs -> bf16x2 (int_nvvm_ff2bf16x2_*): selects CVT_bf16x2_f32,
// forwarding $a and $b in the same order. The mode operand is CvtRN or CvtRZ,
// with the _RELU form used for the *_relu intrinsics.
1343 def : Pat<(int_nvvm_ff2bf16x2_rn Float32Regs:$a, Float32Regs:$b),
1344           (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
1345 def : Pat<(int_nvvm_ff2bf16x2_rn_relu Float32Regs:$a, Float32Regs:$b),
1346           (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
1347 def : Pat<(int_nvvm_ff2bf16x2_rz Float32Regs:$a, Float32Regs:$b),
1348           (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
1349 def : Pat<(int_nvvm_ff2bf16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
1350           (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
// Two f32 inputs -> f16x2 (int_nvvm_ff2f16x2_*): same scheme on CVT_f16x2_f32.
1352 def : Pat<(int_nvvm_ff2f16x2_rn Float32Regs:$a, Float32Regs:$b),
1353           (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
1354 def : Pat<(int_nvvm_ff2f16x2_rn_relu Float32Regs:$a, Float32Regs:$b),
1355           (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
1356 def : Pat<(int_nvvm_ff2f16x2_rz Float32Regs:$a, Float32Regs:$b),
1357           (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
1358 def : Pat<(int_nvvm_ff2f16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
1359           (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
// Single f32 input -> bf16 (int_nvvm_f2bf16_*): CVT_bf16_f32 with the same
// four mode operands (CvtRN, CvtRN_RELU, CvtRZ, CvtRZ_RELU).
1361 def : Pat<(int_nvvm_f2bf16_rn Float32Regs:$a),
1362           (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
1363 def : Pat<(int_nvvm_f2bf16_rn_relu Float32Regs:$a),
1364           (CVT_bf16_f32 Float32Regs:$a, CvtRN_RELU)>;
1365 def : Pat<(int_nvvm_f2bf16_rz Float32Regs:$a),
1366           (CVT_bf16_f32 Float32Regs:$a, CvtRZ)>;
1367 def : Pat<(int_nvvm_f2bf16_rz_relu Float32Regs:$a),
1368           (CVT_bf16_f32 Float32Regs:$a, CvtRZ_RELU)>;
// Instruction selected for int_nvvm_f2tf32_rna: prints "cvt.rna.tf32.f32",
// reading a Float32Regs source ($a) and writing an Int32Regs result ($dest).
// Unlike the Pat-based conversions nearby, the mnemonic (including the .rna
// modifier) is hard-coded in the asm string rather than passed as an operand.
1370 def CVT_tf32_f32 :
1371    NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a),
1372                    "cvt.rna.tf32.f32 \t$dest, $a;",
1373        [(set Int32Regs:$dest, (int_nvvm_f2tf32_rna Float32Regs:$a))]>;
// int_nvvm_lohi_i2d: a single mov.b64 that packs the pair {$src0, $src1} into
// $dst. Instantiates F_MATH_2 with Float64Regs and two Int32Regs classes.
// NOTE(review): F_MATH_2 is defined outside this chunk; presumably its
// register-class arguments are (result, src0, src1) - confirm. Which of
// $src0/$src1 ends up in the low half follows PTX vector-mov packing rules.
1375 def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
1376   Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
// int_nvvm_d2i_lo: unpacks the 64-bit $src0 into a two-element pair with
// mov.b64 and keeps the FIRST element in $dst; the second goes to a scratch
// .b32 register (%temp) declared inside the emitted block.
// NOTE(review): the leading "{{" / trailing "}}" presumably print as literal
// braces so that %temp is scoped to this snippet - confirm against the asm
// string escaping rules.
1378 def INT_NVVM_D2I_LO : F_MATH_1<
1379   !strconcat("{{\n\t",
1380              ".reg .b32 %temp; \n\t",
1381              "mov.b64 \t{$dst, %temp}, $src0;\n\t",
1382              "}}"),
1383   Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
// int_nvvm_d2i_hi: same sequence, but $dst is bound to the SECOND element of
// the unpacked pair and the first element is discarded into %temp.
1384 def INT_NVVM_D2I_HI : F_MATH_1<
1385   !strconcat("{{\n\t",
1386              ".reg .b32 %temp; \n\t",
1387              "mov.b64 \t{%temp, $dst}, $src0;\n\t",
1388              "}}"),
1389   Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
// f32 -> signed 64-bit integer (int_nvvm_f2ll_*): each intrinsic selects
// CVT_s64_f32. The _ftz intrinsics pass CvtR?I_FTZ, the plain ones CvtR?I;
// the R? letters match the intrinsic's rn/rz/rm/rp suffix.
1391 def : Pat<(int_nvvm_f2ll_rn_ftz Float32Regs:$a),
1392           (CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
1393 def : Pat<(int_nvvm_f2ll_rn Float32Regs:$a),
1394           (CVT_s64_f32 Float32Regs:$a, CvtRNI)>;
1395 def : Pat<(int_nvvm_f2ll_rz_ftz Float32Regs:$a),
1396           (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
1397 def : Pat<(int_nvvm_f2ll_rz Float32Regs:$a),
1398           (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
1399 def : Pat<(int_nvvm_f2ll_rm_ftz Float32Regs:$a),
1400           (CVT_s64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
1401 def : Pat<(int_nvvm_f2ll_rm Float32Regs:$a),
1402           (CVT_s64_f32 Float32Regs:$a, CvtRMI)>;
1403 def : Pat<(int_nvvm_f2ll_rp_ftz Float32Regs:$a),
1404           (CVT_s64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
1405 def : Pat<(int_nvvm_f2ll_rp Float32Regs:$a),
1406           (CVT_s64_f32 Float32Regs:$a, CvtRPI)>;
// f32 -> unsigned 64-bit integer (int_nvvm_f2ull_*): same scheme on CVT_u64_f32.
1408 def : Pat<(int_nvvm_f2ull_rn_ftz Float32Regs:$a),
1409           (CVT_u64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
1410 def : Pat<(int_nvvm_f2ull_rn Float32Regs:$a),
1411           (CVT_u64_f32 Float32Regs:$a, CvtRNI)>;
1412 def : Pat<(int_nvvm_f2ull_rz_ftz Float32Regs:$a),
1413           (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
1414 def : Pat<(int_nvvm_f2ull_rz Float32Regs:$a),
1415           (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
1416 def : Pat<(int_nvvm_f2ull_rm_ftz Float32Regs:$a),
1417           (CVT_u64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
1418 def : Pat<(int_nvvm_f2ull_rm Float32Regs:$a),
1419           (CVT_u64_f32 Float32Regs:$a, CvtRMI)>;
1420 def : Pat<(int_nvvm_f2ull_rp_ftz Float32Regs:$a),
1421           (CVT_u64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
1422 def : Pat<(int_nvvm_f2ull_rp Float32Regs:$a),
1423           (CVT_u64_f32 Float32Regs:$a, CvtRPI)>;
// f64 -> signed 64-bit integer (int_nvvm_d2ll_*): each intrinsic selects
// CVT_s64_f64 with the CvtR?I operand matching its rn/rz/rm/rp suffix.
// There are no _ftz variants for the f64 source.
1425 def : Pat<(int_nvvm_d2ll_rn Float64Regs:$a),
1426           (CVT_s64_f64 Float64Regs:$a, CvtRNI)>;
1427 def : Pat<(int_nvvm_d2ll_rz Float64Regs:$a),
1428           (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
1429 def : Pat<(int_nvvm_d2ll_rm Float64Regs:$a),
1430           (CVT_s64_f64 Float64Regs:$a, CvtRMI)>;
1431 def : Pat<(int_nvvm_d2ll_rp Float64Regs:$a),
1432           (CVT_s64_f64 Float64Regs:$a, CvtRPI)>;
// f64 -> unsigned 64-bit integer (int_nvvm_d2ull_*): same scheme on CVT_u64_f64.
1434 def : Pat<(int_nvvm_d2ull_rn Float64Regs:$a),
1435           (CVT_u64_f64 Float64Regs:$a, CvtRNI)>;
1436 def : Pat<(int_nvvm_d2ull_rz Float64Regs:$a),
1437           (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
1438 def : Pat<(int_nvvm_d2ull_rm Float64Regs:$a),
1439           (CVT_u64_f64 Float64Regs:$a, CvtRMI)>;
1440 def : Pat<(int_nvvm_d2ull_rp Float64Regs:$a),
1441           (CVT_u64_f64 Float64Regs:$a, CvtRPI)>;
// Signed 64-bit integer -> f32 (int_nvvm_ll2f_*): CVT_f32_s64 with the
// CvtRN/CvtRZ/CvtRM/CvtRP operand matching the intrinsic suffix.
1443 def : Pat<(int_nvvm_ll2f_rn Int64Regs:$a),
1444           (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
1445 def : Pat<(int_nvvm_ll2f_rz Int64Regs:$a),
1446           (CVT_f32_s64 Int64Regs:$a, CvtRZ)>;
1447 def : Pat<(int_nvvm_ll2f_rm Int64Regs:$a),
1448           (CVT_f32_s64 Int64Regs:$a, CvtRM)>;
1449 def : Pat<(int_nvvm_ll2f_rp Int64Regs:$a),
1450           (CVT_f32_s64 Int64Regs:$a, CvtRP)>;
// Unsigned 64-bit integer -> f32 (int_nvvm_ull2f_*): same scheme on CVT_f32_u64.
1452 def : Pat<(int_nvvm_ull2f_rn Int64Regs:$a),
1453           (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
1454 def : Pat<(int_nvvm_ull2f_rz Int64Regs:$a),
1455           (CVT_f32_u64 Int64Regs:$a, CvtRZ)>;
1456 def : Pat<(int_nvvm_ull2f_rm Int64Regs:$a),
1457           (CVT_f32_u64 Int64Regs:$a, CvtRM)>;
1458 def : Pat<(int_nvvm_ull2f_rp Int64Regs:$a),
1459           (CVT_f32_u64 Int64Regs:$a, CvtRP)>;
// Signed 64-bit integer -> f64 (int_nvvm_ll2d_*): same scheme on CVT_f64_s64.
1461 def : Pat<(int_nvvm_ll2d_rn Int64Regs:$a),
1462           (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
1463 def : Pat<(int_nvvm_ll2d_rz Int64Regs:$a),
1464           (CVT_f64_s64 Int64Regs:$a, CvtRZ)>;
1465 def : Pat<(int_nvvm_ll2d_rm Int64Regs:$a),
1466           (CVT_f64_s64 Int64Regs:$a, CvtRM)>;
1467 def : Pat<(int_nvvm_ll2d_rp Int64Regs:$a),
1468           (CVT_f64_s64 Int64Regs:$a, CvtRP)>;
// Unsigned 64-bit integer -> f64 (int_nvvm_ull2d_*): same scheme on CVT_f64_u64.
1470 def : Pat<(int_nvvm_ull2d_rn Int64Regs:$a),
1471           (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
1472 def : Pat<(int_nvvm_ull2d_rz Int64Regs:$a),
1473           (CVT_f64_u64 Int64Regs:$a, CvtRZ)>;
1474 def : Pat<(int_nvvm_ull2d_rm Int64Regs:$a),
1475           (CVT_f64_u64 Int64Regs:$a, CvtRM)>;
1476 def : Pat<(int_nvvm_ull2d_rp Int64Regs:$a),
1477           (CVT_f64_u64 Int64Regs:$a, CvtRP)>;
// f32 -> f16 (int_nvvm_f2h_rn[_ftz]): selects CVT_f16_f32 with CvtRN_FTZ for
// the _ftz intrinsic and CvtRN otherwise. Only the rn mode is mapped here.
1480 def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
1481           (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
1482 def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
1483           (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
1486 // Bitcast
// Bit-pattern-preserving moves between float and integer register classes.
// Each is an F_MATH_1 instance printing a plain mov of matching width:
//   mov.b32 for the f32 <-> i32 pair, mov.b64 for the f64 <-> i64 pair.
// NOTE(review): F_MATH_1 is defined outside this chunk; presumably its two
// register-class arguments are (result, source) - confirm.
1489 def INT_NVVM_BITCAST_F2I : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs,
1490   Float32Regs, int_nvvm_bitcast_f2i>;
1491 def INT_NVVM_BITCAST_I2F : F_MATH_1<"mov.b32 \t$dst, $src0;", Float32Regs,
1492   Int32Regs, int_nvvm_bitcast_i2f>;
1494 def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs,
1495   Int64Regs, int_nvvm_bitcast_ll2d>;
1496 def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs,
1497   Float64Regs, int_nvvm_bitcast_d2ll>;
1500 // FNS
// Base class for the fns.b32 instruction variants.
//   ins      - input operand list; must name its operands $mask, $base and
//              $offset because the asm string refers to them.
//   Operands - DAG matched by the selection pattern; its result is set into
//              the Int32Regs output $dst.
// Every instance requires hasPTX<60> and hasSM<30>.
1503 class INT_FNS_MBO<dag ins, dag Operands>
1504   : NVPTXInst<(outs Int32Regs:$dst), ins,
1505                "fns.b32 \t$dst, $mask, $base, $offset;",
1506                [(set Int32Regs:$dst, Operands )]>,
1507     Requires<[hasPTX<60>, hasSM<30>]>;
// All eight register/immediate combinations of int_nvvm_fns. The three-letter
// suffix gives the kind of $mask, $base and $offset in that order:
// 'r' = Int32Regs register, 'i' = i32imm operand matched with `imm`.
1509 def INT_FNS_rrr : INT_FNS_MBO<(ins Int32Regs:$mask, Int32Regs:$base, Int32Regs:$offset),
1510                      (int_nvvm_fns Int32Regs:$mask, Int32Regs:$base, Int32Regs:$offset)>;
1511 def INT_FNS_rri : INT_FNS_MBO<(ins Int32Regs:$mask, Int32Regs:$base,    i32imm:$offset),
1512                      (int_nvvm_fns Int32Regs:$mask, Int32Regs:$base,       imm:$offset)>;
1513 def INT_FNS_rir : INT_FNS_MBO<(ins Int32Regs:$mask,    i32imm:$base, Int32Regs:$offset),
1514                      (int_nvvm_fns Int32Regs:$mask,       imm:$base, Int32Regs:$offset)>;
1515 def INT_FNS_rii : INT_FNS_MBO<(ins Int32Regs:$mask,    i32imm:$base,    i32imm:$offset),
1516                      (int_nvvm_fns Int32Regs:$mask,       imm:$base,       imm:$offset)>;
1517 def INT_FNS_irr : INT_FNS_MBO<(ins    i32imm:$mask, Int32Regs:$base, Int32Regs:$offset),
1518                      (int_nvvm_fns       imm:$mask, Int32Regs:$base, Int32Regs:$offset)>;
1519 def INT_FNS_iri : INT_FNS_MBO<(ins    i32imm:$mask, Int32Regs:$base,    i32imm:$offset),
1520                      (int_nvvm_fns       imm:$mask, Int32Regs:$base,       imm:$offset)>;
1521 def INT_FNS_iir : INT_FNS_MBO<(ins    i32imm:$mask,    i32imm:$base, Int32Regs:$offset),
1522                      (int_nvvm_fns       imm:$mask,       imm:$base, Int32Regs:$offset)>;
1523 def INT_FNS_iii : INT_FNS_MBO<(ins    i32imm:$mask,    i32imm:$base,    i32imm:$offset),
1524                      (int_nvvm_fns       imm:$mask,       imm:$base,       imm:$offset)>;
1526 //-----------------------------------
1527 // Atomic Functions
1528 //-----------------------------------
// PatFrag wrappers that attach an address-space predicate to an atomic
// fragment. `ops` and `frag` are forwarded to PatFrag unchanged; the third
// argument is the matching AS_match code snippet, which calls
// ChkMemSDNodeAddressSpace on the node with ADDRESS_SPACE_GLOBAL,
// ADDRESS_SPACE_SHARED or ADDRESS_SPACE_GENERIC respectively.
1530 class ATOMIC_GLOBAL_CHK <dag ops, dag frag>
1531  : PatFrag<ops, frag, AS_match.global>;
1532 class ATOMIC_SHARED_CHK <dag ops, dag frag>
1533  : PatFrag<ops, frag, AS_match.shared>;
1534 class ATOMIC_GENERIC_CHK <dag ops, dag frag>
1535  : PatFrag<ops, frag, AS_match.generic>;
// Two-operand atomic (address + one data operand) for a single pointer width.
// Produces a register-operand instruction (`reg`) and an immediate-operand
// instruction (`imm`).
//   ptrT / ptrclass   - value type and register class of the $addr operand.
//   regT / regclass   - value type and register class of $b (reg form) and $dst.
//   SpaceStr, OpcStr, TypeStr - appended to "atom" in that order to build the
//                       mnemonic, e.g. "atom" # ".global" # ".add" # ".u32".
//   IntOp             - PatFrag matched by the selection pattern.
//   IMMType / IMM     - operand type and DAG leaf used for $b in the imm form.
//   Pred              - predicates required by the generated instructions.
1537 multiclass F_ATOMIC_2_imp<ValueType ptrT, NVPTXRegClass ptrclass,
1538   ValueType regT, NVPTXRegClass regclass,
1539   string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
1540   Operand IMMType, SDNode IMM, list<Predicate> Pred> {
1541   def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
1542     !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"),
1543     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b)))]>,
1544   Requires<Pred>;
  // The immediate form is switched off (Predicate<"false">) when TypeStr is
  // ".f16" or ".bf16"; every other type uses Pred like the register form.
1545   def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
1546     !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"),
1547     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), IMM:$b))]>,
1548   Requires<!if(!or(!eq(TypeStr, ".f16"), !eq(TypeStr, ".bf16")), [Predicate<"false">], Pred)>;
1549 }
// Pointer-width fan-out for F_ATOMIC_2_imp: instantiates it once with i32 /
// Int32Regs addresses (records prefixed `p32`) and once with i64 / Int64Regs
// addresses (`p64`). All other arguments are forwarded unchanged.
// Pred defaults to the empty list, i.e. no extra predicates.
1550 multiclass F_ATOMIC_2<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
1551   string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM,
1552   list<Predicate> Pred = []> {
1553   defm p32 : F_ATOMIC_2_imp<i32, Int32Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
1554     IntOp, IMMType, IMM, Pred>;
1555   defm p64 : F_ATOMIC_2_imp<i64, Int64Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
1556     IntOp, IMMType, IMM, Pred>;
1557 }
1559 // has 2 operands, neg the second one
// Two-operand atomic whose data operand is negated before the atomic is
// issued. Only a register form (`reg`) is generated. The emitted block:
//   1. declares a scratch register `temp` of type .s<TypeStr>,
//   2. writes neg.s<TypeStr> of $b into it,
//   3. issues atom<SpaceStr><OpcStr>.u<TypeStr> on [$addr] with `temp`.
// TypeStr is therefore the bare bit width (the instantiations in this file
// pass "32" or "64"); the .s / .u prefixes are added here.
//   ptrT / ptrclass - value type and register class of the $addr operand.
//   regT / regclass - value type and register class of $b and $dst.
//   IntOp           - PatFrag matched by the selection pattern.
//   Pred            - predicates required by the generated instruction.
1560 multiclass F_ATOMIC_2_NEG_imp<ValueType ptrT, NVPTXRegClass ptrclass,
1561   ValueType regT, NVPTXRegClass regclass,
1562   string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
1563   list<Predicate> Pred> {
1564   def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
1565     !strconcat(
1566       "{{ \n\t",
1567       ".reg \t.s", TypeStr, " temp; \n\t",
1568       "neg.s", TypeStr, " \ttemp, $b; \n\t",
1569       "atom", SpaceStr, OpcStr, ".u", TypeStr, " \t$dst, [$addr], temp; \n\t",
1570       "}}"),
1571     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b)))]>,
1572   Requires<Pred>;
1573 }
// Pointer-width fan-out for F_ATOMIC_2_NEG_imp: one instantiation with i32 /
// Int32Regs addresses (`p32`) and one with i64 / Int64Regs addresses (`p64`).
// All other arguments are forwarded unchanged; Pred defaults to the empty list.
1574 multiclass F_ATOMIC_2_NEG<ValueType regT, NVPTXRegClass regclass, string SpaceStr,
1575   string TypeStr, string OpcStr, PatFrag IntOp, list<Predicate> Pred = []> {
1576  defm p32: F_ATOMIC_2_NEG_imp<i32, Int32Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
1577    IntOp, Pred> ;
1578  defm p64: F_ATOMIC_2_NEG_imp<i64, Int64Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
1579    IntOp, Pred> ;
1580 }
1582 // has 3 operands
// Three-operand atomic (address + two data operands $b and $c) for a single
// pointer width. Four records are produced, differing only in which data
// operands are immediates:
//   reg  - $b register,  $c register
//   imm1 - $b immediate, $c register
//   imm2 - $b register,  $c immediate
//   imm3 - $b immediate, $c immediate
// Immediates use IMMType as the operand type and are matched with `imm`.
//   ptrT / ptrclass   - value type and register class of the $addr operand.
//   regT / regclass   - value type and register class of register data
//                       operands and of $dst.
//   SpaceStr, OpcStr, TypeStr - appended to "atom" in that order to build the
//                       mnemonic.
//   IntOp             - PatFrag matched by the selection pattern.
//   Pred              - predicates required by all four records.
1583 multiclass F_ATOMIC_3_imp<ValueType ptrT, NVPTXRegClass ptrclass,
1584   ValueType regT, NVPTXRegClass regclass,
1585   string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
1586   Operand IMMType, list<Predicate> Pred> {
1587   def reg : NVPTXInst<(outs regclass:$dst),
1588     (ins ptrclass:$addr, regclass:$b, regclass:$c),
1589     !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
1590     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>,
1591   Requires<Pred>;
1593   def imm1 : NVPTXInst<(outs regclass:$dst),
1594     (ins ptrclass:$addr, IMMType:$b, regclass:$c),
1595     !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
1596     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>,
1597   Requires<Pred>;
1599   def imm2 : NVPTXInst<(outs regclass:$dst),
1600     (ins ptrclass:$addr, regclass:$b, IMMType:$c),
1601     !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
1602     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>,
1603   Requires<Pred>;
1605   def imm3 : NVPTXInst<(outs regclass:$dst),
1606     (ins ptrclass:$addr, IMMType:$b, IMMType:$c),
1607     !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
1608     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>,
1609   Requires<Pred>;
1610 }
// Pointer-width fan-out for F_ATOMIC_3_imp: one instantiation with i32 /
// Int32Regs addresses (`p32`) and one with i64 / Int64Regs addresses (`p64`).
// All other arguments are forwarded unchanged; Pred defaults to the empty list.
1611 multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
1612   string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
1613   defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
1614     IntOp, IMMType, Pred>;
1615   defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
1616     IntOp, IMMType, Pred>;
1617 }
1619 // atom_add
// Address-space-qualified fragments for atomic add. The suffix selects the
// wrapper class: _g -> ATOMIC_GLOBAL_CHK, _s -> ATOMIC_SHARED_CHK,
// _gen -> ATOMIC_GENERIC_CHK. $a and $b are forwarded to the wrapped
// fragment in the same order.
// Integer add, 32-bit:
1621 def atomic_load_add_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1622   (atomic_load_add_i32 node:$a, node:$b)>;
1623 def atomic_load_add_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1624   (atomic_load_add_i32 node:$a, node:$b)>;
1625 def atomic_load_add_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1626   (atomic_load_add_i32 node:$a, node:$b)>;
// Integer add, 64-bit:
1627 def atomic_load_add_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1628   (atomic_load_add_i64 node:$a, node:$b)>;
1629 def atomic_load_add_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1630   (atomic_load_add_i64 node:$a, node:$b)>;
1631 def atomic_load_add_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1632   (atomic_load_add_i64 node:$a, node:$b)>;
// Despite the untyped names, these three wrap atomic_load_fadd (not an
// integer add) and carry no width in the fragment itself.
1633 def atomic_load_add_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1634   (atomic_load_fadd node:$a, node:$b)>;
1635 def atomic_load_add_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1636   (atomic_load_fadd node:$a, node:$b)>;
1637 def atomic_load_add_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1638   (atomic_load_fadd node:$a, node:$b)>;
// atom.add instructions, built with F_ATOMIC_2 (so each defm yields p32/p64
// address variants, each with a reg and an imm form).
// Name scheme: _G / _S / _GEN = ".global" / ".shared" / "" space string,
// paired with the _g / _s / _gen fragment.
// NOTE(review): the *_GEN_*_USE_G records match the same _gen fragment as the
// plain *_GEN_* records, with no distinguishing predicate, but print
// ".global". As written both are candidates for the same node - confirm this
// is intended.
// 32-bit integer add (.u32):
1640 defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", ".add",
1641   atomic_load_add_i32_g, i32imm, imm>;
1642 defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", ".add",
1643   atomic_load_add_i32_s, i32imm, imm>;
1644 defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".add",
1645   atomic_load_add_i32_gen, i32imm, imm>;
1646 defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32",
1647   ".add", atomic_load_add_i32_gen, i32imm, imm>;
// 64-bit integer add (.u64):
1649 defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", ".add",
1650   atomic_load_add_i64_g, i64imm, imm>;
1651 defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64", ".add",
1652   atomic_load_add_i64_s, i64imm, imm>;
1653 defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".add",
1654   atomic_load_add_i64_gen, i64imm, imm>;
1655 defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
1656   ".add", atomic_load_add_i64_gen, i64imm, imm>;
// f16 add: value type f16 held in Int16Regs, opcode ".add.noftz", requires
// hasSM<70> and hasPTX<63>. F_ATOMIC_2_imp disables the imm form for ".f16".
1658 defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2<f16, Int16Regs, ".global", ".f16", ".add.noftz",
1659   atomic_load_add_g, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
1660 defm INT_PTX_ATOM_ADD_S_F16 : F_ATOMIC_2<f16, Int16Regs, ".shared", ".f16", ".add.noftz",
1661   atomic_load_add_s, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
1662 defm INT_PTX_ATOM_ADD_GEN_F16 : F_ATOMIC_2<f16, Int16Regs, "", ".f16", ".add.noftz",
1663   atomic_load_add_gen, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
// bf16 add: value type bf16 held in Int16Regs, opcode ".add.noftz", requires
// hasSM<90> and hasPTX<78>. The imm form is likewise disabled for ".bf16".
1665 defm INT_PTX_ATOM_ADD_G_BF16 : F_ATOMIC_2<bf16, Int16Regs, ".global", ".bf16", ".add.noftz",
1666   atomic_load_add_g, bf16imm, fpimm, [hasSM<90>, hasPTX<78>]>;
1667 defm INT_PTX_ATOM_ADD_S_BF16 : F_ATOMIC_2<bf16, Int16Regs, ".shared", ".bf16", ".add.noftz",
1668   atomic_load_add_s, bf16imm, fpimm, [hasSM<90>, hasPTX<78>]>;
1669 defm INT_PTX_ATOM_ADD_GEN_BF16 : F_ATOMIC_2<bf16, Int16Regs, "", ".bf16", ".add.noftz",
1670   atomic_load_add_gen, bf16imm, fpimm, [hasSM<90>, hasPTX<78>]>;
// f32 add: no extra predicates.
1672 defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<f32, Float32Regs, ".global", ".f32", ".add",
1673   atomic_load_add_g, f32imm, fpimm>;
1674 defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<f32, Float32Regs, ".shared", ".f32", ".add",
1675   atomic_load_add_s, f32imm, fpimm>;
1676 defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<f32, Float32Regs, "", ".f32", ".add",
1677   atomic_load_add_gen, f32imm, fpimm>;
// f64 add: gated on the hasAtomAddF64 predicate.
1679 defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<f64, Float64Regs, ".global", ".f64", ".add",
1680   atomic_load_add_g, f64imm, fpimm, [hasAtomAddF64]>;
1681 defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<f64, Float64Regs, ".shared", ".f64", ".add",
1682   atomic_load_add_s, f64imm, fpimm, [hasAtomAddF64]>;
1683 defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<f64, Float64Regs, "", ".f64", ".add",
1684   atomic_load_add_gen, f64imm, fpimm, [hasAtomAddF64]>;
1686 // atom_sub
// Address-space-qualified fragments for integer atomic subtract, 32- and
// 64-bit. Suffix _g / _s / _gen selects the global / shared / generic
// address-space check; $a and $b are forwarded unchanged.
1688 def atomic_load_sub_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1689   (atomic_load_sub_i32 node:$a, node:$b)>;
1690 def atomic_load_sub_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1691   (atomic_load_sub_i32 node:$a, node:$b)>;
1692 def atomic_load_sub_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1693   (atomic_load_sub_i32 node:$a, node:$b)>;
1694 def atomic_load_sub_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1695   (atomic_load_sub_i64 node:$a, node:$b)>;
1696 def atomic_load_sub_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1697   (atomic_load_sub_i64 node:$a, node:$b)>;
1698 def atomic_load_sub_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1699   (atomic_load_sub_i64 node:$a, node:$b)>;
// Atomic subtract is emitted as an atomic ".add" of the negated operand:
// these use F_ATOMIC_2_NEG, which negates $b into a scratch register first.
// TypeStr is the bare width ("32" / "64") because F_ATOMIC_2_NEG_imp adds the
// .s / .u prefixes itself. Only register forms exist (no imm variant).
// NOTE(review): the *_GEN_*_USE_G records match the same _gen fragment as the
// plain *_GEN_* records but print ".global" - confirm this is intended.
1701 defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<i32, Int32Regs, ".global", "32", ".add",
1702   atomic_load_sub_i32_g>;
1703 defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<i64, Int64Regs, ".global", "64", ".add",
1704   atomic_load_sub_i64_g>;
1705 defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<i32, Int32Regs, "", "32", ".add",
1706   atomic_load_sub_i32_gen>;
1707 defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<i32, Int32Regs, ".global", "32",
1708   ".add", atomic_load_sub_i32_gen>;
1709 defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<i32, Int32Regs, ".shared", "32", ".add",
1710   atomic_load_sub_i32_s>;
1711 defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<i64, Int64Regs, ".shared", "64", ".add",
1712   atomic_load_sub_i64_s>;
1713 defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<i64, Int64Regs, "", "64", ".add",
1714   atomic_load_sub_i64_gen>;
1715 defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<i64, Int64Regs, ".global", "64",
1716   ".add", atomic_load_sub_i64_gen>;
1718 // atom_swap
// Address-space-qualified fragments for atomic swap, 32- and 64-bit.
// Suffix _g / _s / _gen selects the global / shared / generic address-space
// check; $a and $b are forwarded unchanged.
1720 def atomic_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1721   (atomic_swap_i32 node:$a, node:$b)>;
1722 def atomic_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1723   (atomic_swap_i32 node:$a, node:$b)>;
1724 def atomic_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1725   (atomic_swap_i32 node:$a, node:$b)>;
1726 def atomic_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1727   (atomic_swap_i64 node:$a, node:$b)>;
1728 def atomic_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1729   (atomic_swap_i64 node:$a, node:$b)>;
1730 def atomic_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1731   (atomic_swap_i64 node:$a, node:$b)>;
// Atomic swap, printed with the ".exch" opcode and untyped-bits type strings
// (.b32 / .b64). Built with F_ATOMIC_2, so reg and imm forms exist for both
// pointer widths. No extra predicates.
// NOTE(review): the *_GEN_*_USE_G records match the same _gen fragment as the
// plain *_GEN_* records but print ".global" - confirm this is intended.
1733 defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".exch",
1734   atomic_swap_i32_g, i32imm, imm>;
1735 defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".exch",
1736   atomic_swap_i32_s, i32imm, imm>;
1737 defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".exch",
1738   atomic_swap_i32_gen, i32imm, imm>;
1739 defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32",
1740   ".exch", atomic_swap_i32_gen, i32imm, imm>;
1741 defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".exch",
1742   atomic_swap_i64_g, i64imm, imm>;
1743 defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".exch",
1744   atomic_swap_i64_s, i64imm, imm>;
1745 defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".exch",
1746   atomic_swap_i64_gen, i64imm, imm>;
1747 defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
1748   ".exch", atomic_swap_i64_gen, i64imm, imm>;
1750 // atom_max
// Address-space-qualified fragments for atomic max. atomic_load_max_* wrap
// the atomic_load_max_i32/i64 fragments and atomic_load_umax_* wrap
// atomic_load_umax_i32/i64. Suffix _g / _s / _gen selects the global /
// shared / generic address-space check.
1752 def atomic_load_max_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
1753   , (atomic_load_max_i32 node:$a, node:$b)>;
1754 def atomic_load_max_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1755   (atomic_load_max_i32 node:$a, node:$b)>;
1756 def atomic_load_max_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1757   (atomic_load_max_i32 node:$a, node:$b)>;
1758 def atomic_load_max_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
1759   , (atomic_load_max_i64 node:$a, node:$b)>;
1760 def atomic_load_max_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1761   (atomic_load_max_i64 node:$a, node:$b)>;
1762 def atomic_load_max_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1763   (atomic_load_max_i64 node:$a, node:$b)>;
1764 def atomic_load_umax_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1765   (atomic_load_umax_i32 node:$a, node:$b)>;
1766 def atomic_load_umax_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1767   (atomic_load_umax_i32 node:$a, node:$b)>;
1768 def atomic_load_umax_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1769   (atomic_load_umax_i32 node:$a, node:$b)>;
1770 def atomic_load_umax_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1771   (atomic_load_umax_i64 node:$a, node:$b)>;
1772 def atomic_load_umax_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1773   (atomic_load_umax_i64 node:$a, node:$b)>;
1774 def atomic_load_umax_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1775   (atomic_load_umax_i64 node:$a, node:$b)>;
// atom.max instructions built with F_ATOMIC_2. The max fragments print with
// signed type strings (.s32 / .s64), the umax fragments with unsigned ones
// (.u32 / .u64). All 64-bit records additionally require hasSM<32>; the
// 32-bit ones carry no extra predicate.
// NOTE(review): the *_GEN_*_USE_G records match the same _gen fragment as the
// plain *_GEN_* records but print ".global" - confirm this is intended.
1777 defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".s32",
1778   ".max", atomic_load_max_i32_g, i32imm, imm>;
1779 defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".s32",
1780   ".max", atomic_load_max_i32_s, i32imm, imm>;
1781 defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".s32", ".max",
1782   atomic_load_max_i32_gen, i32imm, imm>;
1783 defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global",
1784   ".s32", ".max", atomic_load_max_i32_gen, i32imm, imm>;
1785 defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".s64",
1786   ".max", atomic_load_max_i64_g, i64imm, imm, [hasSM<32>]>;
1787 defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".s64",
1788   ".max", atomic_load_max_i64_s, i64imm, imm, [hasSM<32>]>;
1789 defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".s64", ".max",
1790   atomic_load_max_i64_gen, i64imm, imm, [hasSM<32>]>;
1791 defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global",
1792   ".s64", ".max", atomic_load_max_i64_gen, i64imm, imm, [hasSM<32>]>;
1793 defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32",
1794   ".max", atomic_load_umax_i32_g, i32imm, imm>;
1795 defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32",
1796   ".max", atomic_load_umax_i32_s, i32imm, imm>;
1797 defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".max",
1798   atomic_load_umax_i32_gen, i32imm, imm>;
1799 defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global",
1800   ".u32", ".max", atomic_load_umax_i32_gen, i32imm, imm>;
1801 defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
1802   ".max", atomic_load_umax_i64_g, i64imm, imm, [hasSM<32>]>;
1803 defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64",
1804   ".max", atomic_load_umax_i64_s, i64imm, imm, [hasSM<32>]>;
1805 defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".max",
1806   atomic_load_umax_i64_gen, i64imm, imm, [hasSM<32>]>;
1807 defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global",
1808   ".u64", ".max", atomic_load_umax_i64_gen, i64imm, imm, [hasSM<32>]>;
1810 // atom_min
// Address-space-qualified fragments for atomic min. atomic_load_min_* wrap
// the atomic_load_min_i32/i64 fragments and atomic_load_umin_* wrap
// atomic_load_umin_i32/i64. Suffix _g / _s / _gen selects the global /
// shared / generic address-space check.
1812 def atomic_load_min_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1813   (atomic_load_min_i32 node:$a, node:$b)>;
1814 def atomic_load_min_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1815   (atomic_load_min_i32 node:$a, node:$b)>;
1816 def atomic_load_min_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1817   (atomic_load_min_i32 node:$a, node:$b)>;
1818 def atomic_load_min_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1819   (atomic_load_min_i64 node:$a, node:$b)>;
1820 def atomic_load_min_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1821   (atomic_load_min_i64 node:$a, node:$b)>;
1822 def atomic_load_min_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1823   (atomic_load_min_i64 node:$a, node:$b)>;
1824 def atomic_load_umin_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1825   (atomic_load_umin_i32 node:$a, node:$b)>;
1826 def atomic_load_umin_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1827   (atomic_load_umin_i32 node:$a, node:$b)>;
1828 def atomic_load_umin_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1829   (atomic_load_umin_i32 node:$a, node:$b)>;
1830 def atomic_load_umin_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1831   (atomic_load_umin_i64 node:$a, node:$b)>;
1832 def atomic_load_umin_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1833   (atomic_load_umin_i64 node:$a, node:$b)>;
1834 def atomic_load_umin_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1835   (atomic_load_umin_i64 node:$a, node:$b)>;
// atom.min instructions built with F_ATOMIC_2. The min fragments print with
// signed type strings (.s32 / .s64), the umin fragments with unsigned ones
// (.u32 / .u64). All 64-bit records additionally require hasSM<32>; the
// 32-bit ones carry no extra predicate.
// NOTE(review): the *_GEN_*_USE_G records match the same _gen fragment as the
// plain *_GEN_* records but print ".global" - confirm this is intended.
1837 defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".s32",
1838   ".min", atomic_load_min_i32_g, i32imm, imm>;
1839 defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".s32",
1840   ".min", atomic_load_min_i32_s, i32imm, imm>;
1841 defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".s32", ".min",
1842   atomic_load_min_i32_gen, i32imm, imm>;
1843 defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global",
1844   ".s32", ".min", atomic_load_min_i32_gen, i32imm, imm>;
1845 defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".s64",
1846   ".min", atomic_load_min_i64_g, i64imm, imm, [hasSM<32>]>;
1847 defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".s64",
1848   ".min", atomic_load_min_i64_s, i64imm, imm, [hasSM<32>]>;
1849 defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".s64", ".min",
1850   atomic_load_min_i64_gen, i64imm, imm, [hasSM<32>]>;
1851 defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global",
1852   ".s64", ".min", atomic_load_min_i64_gen, i64imm, imm, [hasSM<32>]>;
1853 defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32",
1854   ".min", atomic_load_umin_i32_g, i32imm, imm>;
1855 defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32",
1856   ".min", atomic_load_umin_i32_s, i32imm, imm>;
1857 defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".min",
1858   atomic_load_umin_i32_gen, i32imm, imm>;
1859 defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global",
1860   ".u32", ".min", atomic_load_umin_i32_gen, i32imm, imm>;
1861 defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
1862   ".min", atomic_load_umin_i64_g, i64imm, imm, [hasSM<32>]>;
1863 defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64",
1864   ".min", atomic_load_umin_i64_s, i64imm, imm, [hasSM<32>]>;
1865 defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".min",
1866   atomic_load_umin_i64_gen, i64imm, imm, [hasSM<32>]>;
1867 defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global",
1868   ".u64", ".min", atomic_load_umin_i64_gen, i64imm, imm, [hasSM<32>]>;
1870 // atom_inc  atom_dec
// Address-space-qualified fragments for atomic inc / dec. Unlike the other
// atomics in this file, these wrap NVVM intrinsics
// (int_nvvm_atomic_load_inc_32 / int_nvvm_atomic_load_dec_32) rather than
// generic atomic_load_* fragments. Only 32-bit forms are defined.
// Suffix _g / _s / _gen selects the global / shared / generic check.
1872 def atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1873   (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
1874 def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1875   (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
1876 def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1877   (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
1878 def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1879   (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
1880 def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1881   (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
1882 def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1883   (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
// atom.inc / atom.dec instructions built with F_ATOMIC_2, all on .u32 with
// Int32Regs data and i32imm immediates. No extra predicates.
// NOTE(review): the *_GEN_32_USE_G records match the same _gen fragment as
// the plain *_GEN_32 records but print ".global" - confirm this is intended.
1885 defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", ".inc",
1886   atomic_load_inc_32_g, i32imm, imm>;
1887 defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", ".inc",
1888   atomic_load_inc_32_s, i32imm, imm>;
1889 defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".inc",
1890   atomic_load_inc_32_gen, i32imm, imm>;
1891 defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32",
1892   ".inc", atomic_load_inc_32_gen, i32imm, imm>;
1893 defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", ".dec",
1894   atomic_load_dec_32_g, i32imm, imm>;
1895 defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", ".dec",
1896   atomic_load_dec_32_s, i32imm, imm>;
1897 defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".dec",
1898   atomic_load_dec_32_gen, i32imm, imm>;
1899 defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32",
1900   ".dec", atomic_load_dec_32_gen, i32imm, imm>;
1902 // atom_and
// Pattern fragments wrapping atomic_load_and_i32 / atomic_load_and_i64
// (address $a, operand $b) in the global / shared / generic check classes,
// which are defined outside this section.
1904 def atomic_load_and_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1905   (atomic_load_and_i32 node:$a, node:$b)>;
1906 def atomic_load_and_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1907   (atomic_load_and_i32 node:$a, node:$b)>;
1908 def atomic_load_and_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1909   (atomic_load_and_i32 node:$a, node:$b)>;
1910 def atomic_load_and_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1911   (atomic_load_and_i64 node:$a, node:$b)>;
1912 def atomic_load_and_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1913   (atomic_load_and_i64 node:$a, node:$b)>;
1914 def atomic_load_and_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1915   (atomic_load_and_i64 node:$a, node:$b)>;
// Instruction definitions for ".and" using the untyped ".b32"/".b64" type
// strings.  The 64-bit variants pass [hasSM<32>]; the 32-bit variants pass no
// predicate list.  *_USE_G pairs the generic fragment with ".global".
1917 defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".and",
1918   atomic_load_and_i32_g, i32imm, imm>;
1919 defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".and",
1920   atomic_load_and_i32_s, i32imm, imm>;
1921 defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".and",
1922   atomic_load_and_i32_gen, i32imm, imm>;
1923 defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32",
1924   ".and", atomic_load_and_i32_gen, i32imm, imm>;
1925 defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".and",
1926   atomic_load_and_i64_g, i64imm, imm, [hasSM<32>]>;
1927 defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".and",
1928   atomic_load_and_i64_s, i64imm, imm, [hasSM<32>]>;
1929 defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".and",
1930   atomic_load_and_i64_gen, i64imm, imm, [hasSM<32>]>;
1931 defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
1932   ".and", atomic_load_and_i64_gen, i64imm, imm, [hasSM<32>]>;
1934 // atom_or
// Pattern fragments wrapping atomic_load_or_i32 / atomic_load_or_i64
// (address $a, operand $b) in the global / shared / generic check classes,
// which are defined outside this section.
1936 def atomic_load_or_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1937   (atomic_load_or_i32 node:$a, node:$b)>;
1938 def atomic_load_or_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1939   (atomic_load_or_i32 node:$a, node:$b)>;
1940 def atomic_load_or_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1941   (atomic_load_or_i32 node:$a, node:$b)>;
1942 def atomic_load_or_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1943   (atomic_load_or_i64 node:$a, node:$b)>;
1944 def atomic_load_or_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1945   (atomic_load_or_i64 node:$a, node:$b)>;
1946 def atomic_load_or_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1947   (atomic_load_or_i64 node:$a, node:$b)>;
// Instruction definitions for ".or" using ".b32"/".b64".  The same eight
// variants exist as for ".and" and ".xor", but here the shared-space defm is
// listed last within each width.  The 64-bit variants pass [hasSM<32>].
1949 defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".or",
1950   atomic_load_or_i32_g, i32imm, imm>;
1951 defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".or",
1952   atomic_load_or_i32_gen, i32imm, imm>;
1953 defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32",
1954   ".or", atomic_load_or_i32_gen, i32imm, imm>;
1955 defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".or",
1956   atomic_load_or_i32_s, i32imm, imm>;
1957 defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".or",
1958   atomic_load_or_i64_g, i64imm, imm, [hasSM<32>]>;
1959 defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".or",
1960   atomic_load_or_i64_gen, i64imm, imm, [hasSM<32>]>;
1961 defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
1962   ".or", atomic_load_or_i64_gen, i64imm, imm, [hasSM<32>]>;
1963 defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".or",
1964   atomic_load_or_i64_s, i64imm, imm, [hasSM<32>]>;
1966 // atom_xor
// Pattern fragments wrapping atomic_load_xor_i32 / atomic_load_xor_i64
// (address $a, operand $b) in the global / shared / generic check classes,
// which are defined outside this section.
1968 def atomic_load_xor_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1969   (atomic_load_xor_i32 node:$a, node:$b)>;
1970 def atomic_load_xor_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1971   (atomic_load_xor_i32 node:$a, node:$b)>;
1972 def atomic_load_xor_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1973   (atomic_load_xor_i32 node:$a, node:$b)>;
1974 def atomic_load_xor_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
1975   (atomic_load_xor_i64 node:$a, node:$b)>;
1976 def atomic_load_xor_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
1977   (atomic_load_xor_i64 node:$a, node:$b)>;
1978 def atomic_load_xor_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
1979   (atomic_load_xor_i64 node:$a, node:$b)>;
// Instruction definitions for ".xor" using ".b32"/".b64".  The 64-bit
// variants pass [hasSM<32>]; the 32-bit variants pass no predicate list.
// *_USE_G pairs the generic fragment with ".global".
1981 defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".xor",
1982   atomic_load_xor_i32_g, i32imm, imm>;
1983 defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".xor",
1984   atomic_load_xor_i32_s, i32imm, imm>;
1985 defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".xor",
1986   atomic_load_xor_i32_gen, i32imm, imm>;
1987 defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32",
1988   ".xor", atomic_load_xor_i32_gen, i32imm, imm>;
1989 defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".xor",
1990   atomic_load_xor_i64_g, i64imm, imm, [hasSM<32>]>;
1991 defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".xor",
1992   atomic_load_xor_i64_s, i64imm, imm, [hasSM<32>]>;
1993 defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor",
1994   atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;
1995 defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
1996   ".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;
1998 // atom_cas
// Pattern fragments for compare-and-swap.  Unlike the two-operand atomics
// above, these take three nodes ($a, $b, $c) and wrap atomic_cmp_swap_i32 /
// atomic_cmp_swap_i64 in the global / shared / generic check classes.
2000 def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
2001   (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
2002 def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
2003   (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
2004 def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
2005   (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
2006 def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
2007   (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
2008 def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
2009   (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
2010 def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
2011   (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
// Instruction definitions for ".cas" via F_ATOMIC_3 (defined outside this
// section).  F_ATOMIC_3 is given only the immediate operand type (no separate
// immediate node argument), and none of these defms -- including the 64-bit
// ones -- passes a predicate list.
2013 defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
2014   atomic_cmp_swap_i32_g, i32imm>;
2015 defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
2016   atomic_cmp_swap_i32_s, i32imm>;
2017 defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<i32, Int32Regs, "", ".b32", ".cas",
2018   atomic_cmp_swap_i32_gen, i32imm>;
2019 defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32",
2020   ".cas", atomic_cmp_swap_i32_gen, i32imm>;
2021 defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", ".cas",
2022   atomic_cmp_swap_i64_g, i64imm>;
2023 defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<i64, Int64Regs, ".shared", ".b64", ".cas",
2024   atomic_cmp_swap_i64_s, i64imm>;
2025 defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<i64, Int64Regs, "", ".b64", ".cas",
2026   atomic_cmp_swap_i64_gen, i64imm>;
2027 defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64",
2028   ".cas", atomic_cmp_swap_i64_gen, i64imm>;
2030 // Support for scoped atomic operations.  Matches
2031 // int_nvvm_atomic_{op}_{space}_{type}_{scope}
2032 // and converts it into the appropriate instruction.
2033 // NOTE: not all possible combinations are implemented
2034 //  'space' is limited to generic as it's the only one needed to support CUDA.
2035 //  'scope' = 'gpu' is default and is handled by regular atomic instructions.
//
// ATOM23_impl is the common instruction class for the two- and three-operand
// scoped atomics: one output register $result of class `regclass`, the
// caller-supplied `ins` dag, the asm string `AsmStr`, and a single selection
// pattern that sets $result (typed as `regT`) to the `Operands` dag.  The
// instruction is gated on the predicate list `Preds`.
2036 class ATOM23_impl<string AsmStr, ValueType regT, NVPTXRegClass regclass, list<Predicate> Preds,
2037                   dag ins, dag Operands>
2038       : NVPTXInst<(outs regclass:$result), ins,
2039                   AsmStr,
2040                   [(set (regT regclass:$result), Operands)]>,
2041         Requires<Preds>;
2043 // Define instruction variants for all addressing modes.
// Two-operand form: emits six anonymous ATOM23_impl records for intrinsic
// `Intr` -- for each address register width ($src in Int16Regs, Int32Regs or
// Int64Regs) one record whose $b operand is a register of `regclass`, and one
// whose $b operand is an immediate (`ImmType` operand matched by `Imm`).
// The register-operand records are defined under AddedComplexity = 1; the
// immediate-operand records keep the default complexity.
2044 multiclass ATOM2P_impl<string AsmStr,  Intrinsic Intr,
2045                        ValueType regT, NVPTXRegClass regclass, Operand ImmType,
2046                        SDNode Imm, ValueType ImmTy,
2047                        list<Predicate> Preds> {
2048   let AddedComplexity = 1 in {
2049     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2050                       (ins Int16Regs:$src, regclass:$b),
2051                       (Intr (i16 Int16Regs:$src), (regT regclass:$b))>;
2052     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2053                       (ins Int32Regs:$src, regclass:$b),
2054                       (Intr (i32 Int32Regs:$src), (regT regclass:$b))>;
2055     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2056                       (ins Int64Regs:$src, regclass:$b),
2057                       (Intr (i64 Int64Regs:$src), (regT regclass:$b))>;
2058   }
2059   // tablegen can't infer argument types from Intrinsic (though it can
2060   // from Instruction) so we have to enforce specific type on
2061   // immediates via explicit cast to ImmTy.
2062   def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2063                     (ins Int16Regs:$src, ImmType:$b),
2064                     (Intr (i16 Int16Regs:$src), (ImmTy Imm:$b))>;
2065   def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2066                     (ins Int32Regs:$src, ImmType:$b),
2067                     (Intr (i32 Int32Regs:$src), (ImmTy Imm:$b))>;
2068   def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2069                     (ins Int64Regs:$src, ImmType:$b),
2070                     (Intr (i64 Int64Regs:$src), (ImmTy Imm:$b))>;
// Three-operand form (used for compare-and-swap below): emits eight
// anonymous ATOM23_impl records for intrinsic `Intr`.  For each address
// register width ($src in Int32Regs or Int64Regs -- there is no Int16Regs
// address variant here) it covers all four register/immediate combinations of
// $b and $c.  AddedComplexity is 2 for reg/reg, 1 for the mixed forms, and
// the default for imm/imm.
2073 multiclass ATOM3P_impl<string AsmStr,  Intrinsic Intr,
2074                        ValueType regT, NVPTXRegClass regclass,
2075                        Operand ImmType, SDNode Imm, ValueType ImmTy,
2076                        list<Predicate> Preds> {
2077   // Variants for register/immediate permutations of $b and $c
2078   let AddedComplexity = 2 in {
2079     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2080                       (ins Int32Regs:$src, regclass:$b, regclass:$c),
2081                       (Intr (i32 Int32Regs:$src), (regT regclass:$b), (regT regclass:$c))>;
2082     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2083                       (ins Int64Regs:$src, regclass:$b, regclass:$c),
2084                       (Intr (i64 Int64Regs:$src), (regT regclass:$b), (regT regclass:$c))>;
2085   }
2086   let AddedComplexity = 1 in {
2087     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2088                       (ins Int32Regs:$src, ImmType:$b, regclass:$c),
2089                       (Intr (i32 Int32Regs:$src), (ImmTy Imm:$b), (regT regclass:$c))>;
2090     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2091                       (ins Int64Regs:$src, ImmType:$b, regclass:$c),
2092                       (Intr (i64 Int64Regs:$src), (ImmTy Imm:$b), (regT regclass:$c))>;
2093     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2094                       (ins Int32Regs:$src, regclass:$b, ImmType:$c),
2095                       (Intr (i32 Int32Regs:$src), (regT regclass:$b), (ImmTy Imm:$c))>;
2096     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2097                       (ins Int64Regs:$src, regclass:$b, ImmType:$c),
2098                       (Intr (i64 Int64Regs:$src), (regT regclass:$b), (ImmTy Imm:$c))>;
2099   }
2100   def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2101                     (ins Int32Regs:$src, ImmType:$b, ImmType:$c),
2102                     (Intr (i32 Int32Regs:$src), (ImmTy Imm:$b), (ImmTy Imm:$c))>;
2103   def : ATOM23_impl<AsmStr, regT, regclass, Preds,
2104                     (ins Int64Regs:$src, ImmType:$b, ImmType:$c),
2105                     (Intr (i64 Int64Regs:$src), (ImmTy Imm:$b), (ImmTy Imm:$c))>;
2108 // Constructs intrinsic name and instruction asm strings.
// Two-operand variant.  Forwards to ATOM2P_impl with:
//   asm:       "atom" [ "." SpaceStr unless SpaceStr is "gen" ]
//                     [ "." ScopeStr unless ScopeStr is "gpu" ]
//              "." OpStr "." TypeStr " \t$result, [$src], $b;"
//   intrinsic: the record named
//              "int_nvvm_atomic_" OpStr "_" SpaceStr "_" IntTypeStr
//              [ "_" ScopeStr unless ScopeStr is empty ]
//              looked up via !cast<Intrinsic>.
// NOTE(review): the asm string drops the scope for "gpu" while the intrinsic
// name drops it only for an empty ScopeStr.  The instantiations visible in
// this section pass only "cta" and "sys", so neither special case is
// exercised here -- confirm the intended behavior before adding other scopes.
2109 multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr,
2110                        string ScopeStr, string SpaceStr,
2111                        ValueType regT, NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
2112                        ValueType ImmTy, list<Predicate> Preds> {
2113   defm : ATOM2P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
2114                             # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
2115                             # "." # OpStr # "." # TypeStr
2116                             # " \t$result, [$src], $b;",
2117                      !cast<Intrinsic>(
2118                             "int_nvvm_atomic_" # OpStr
2119                             # "_" # SpaceStr # "_" # IntTypeStr
2120                             # !if(!empty(ScopeStr), "", "_" # ScopeStr)),
2121                      regT, regclass, ImmType, Imm, ImmTy, Preds>;
// Three-operand counterpart of the name/asm-string builder.  Forwards to
// ATOM3P_impl with:
//   asm:       "atom" [ "." SpaceStr unless SpaceStr is "gen" ]
//                     [ "." ScopeStr unless ScopeStr is "gpu" ]
//              "." OpStr "." TypeStr " \t$result, [$src], $b, $c;"
//   intrinsic: the record named
//              "int_nvvm_atomic_" OpStr "_" SpaceStr "_" IntTypeStr
//              [ "_" ScopeStr unless ScopeStr is empty ]
//              looked up via !cast<Intrinsic>.
2123 multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
2124                        string ScopeStr, string SpaceStr,
2125                        ValueType regT, NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
2126                        ValueType ImmTy, list<Predicate> Preds> {
2127   defm : ATOM3P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
2128                             # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
2129                             # "." # OpStr # "." # TypeStr
2130                             # " \t$result, [$src], $b, $c;",
2131                      !cast<Intrinsic>(
2132                             "int_nvvm_atomic_" # OpStr
2133                             # "_" # SpaceStr # "_" # IntTypeStr
2134                             # !if(!empty(ScopeStr), "", "_" # ScopeStr)),
2135                      regT, regclass, ImmType, Imm, ImmTy, Preds>;
2138 // Constructs variants for different address spaces.
2139 // For now we only need variants for generic space pointers.
// Two-operand variant: the only instantiation is `_gen_`, which passes "gen"
// as SpaceStr to ATOM2N_impl and forwards every other parameter unchanged.
2140 multiclass ATOM2A_impl<string OpStr, string IntTypeStr, string TypeStr,
2141                        string ScopeStr, ValueType regT, NVPTXRegClass regclass, Operand ImmType,
2142                        SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
2143    defm _gen_ : ATOM2N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
2144                             regT, regclass, ImmType, Imm, ImmTy, Preds>;
// Three-operand address-space fan-out: the only instantiation is `_gen_`,
// which passes "gen" as SpaceStr to ATOM3N_impl and forwards every other
// parameter unchanged.
2146 multiclass ATOM3A_impl<string OpStr, string IntTypeStr, string TypeStr,
2147                        string ScopeStr, ValueType regT, NVPTXRegClass regclass, Operand ImmType,
2148                        SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
2149    defm _gen_ : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
2150                             regT, regclass, ImmType, Imm, ImmTy, Preds>;
2153 // Constructs variants for different scopes of atomic op.
// Two-operand variant: instantiates ATOM2A_impl once with scope "cta" and
// once with scope "sys".  In both cases hasAtomScope is appended to the
// caller-supplied predicate list.
2154 multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr,
2155                        ValueType regT, NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
2156                        ValueType ImmTy, list<Predicate> Preds> {
2157    // .gpu scope is default and is currently covered by existing
2158    // atomics w/o explicitly specified scope.
2159    defm _cta : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "cta",
2160                            regT, regclass, ImmType, Imm, ImmTy,
2161                            !listconcat(Preds,[hasAtomScope])>;
2162    defm _sys : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "sys",
2163                            regT, regclass, ImmType, Imm, ImmTy,
2164                            !listconcat(Preds,[hasAtomScope])>;
// Three-operand scope fan-out: instantiates ATOM3A_impl once with scope "cta"
// and once with scope "sys".  In both cases hasAtomScope is appended to the
// caller-supplied predicate list.
2166 multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr,
2167            ValueType regT, NVPTXRegClass regclass, Operand ImmType, SDNode Imm, ValueType ImmTy,
2168            list<Predicate> Preds> {
2169    // No need to define ".gpu"-scoped atomics.  They do the same thing
2170    // as the regular, non-scoped atomics defined elsewhere.
2171    defm _cta : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "cta",
2172                            regT, regclass, ImmType, Imm, ImmTy,
2173                            !listconcat(Preds,[hasAtomScope])>;
2174    defm _sys : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "sys",
2175                            regT, regclass, ImmType, Imm, ImmTy,
2176                            !listconcat(Preds,[hasAtomScope])>;
2179 // atom.add
// Per-type instantiations of the scoped two-operand atomics for operation
// `OpStr`.  The second argument ("i" or "f") is the IntTypeStr used when
// building the intrinsic name; the third is the PTX type string.
// Integer types: s32, u32, u64 (no extra predicates).
// Floating-point types: bf16 and f16 are held in Int16Regs and require
// [hasSM<90>, hasPTX<78>] and [hasSM<70>, hasPTX<63>] respectively; f32 has
// no extra predicate; f64 requires hasAtomAddF64.
2180 multiclass ATOM2_add_impl<string OpStr> {
2181    defm _s32  : ATOM2S_impl<OpStr, "i", "s32", i32, Int32Regs, i32imm, imm, i32, []>;
2182    defm _u32  : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>;
2183    defm _u64  : ATOM2S_impl<OpStr, "i", "u64", i64, Int64Regs, i64imm, imm, i64, []>;
2184    defm _bf16  : ATOM2S_impl<OpStr, "f", "bf16", bf16, Int16Regs, bf16imm, fpimm, bf16,
2185                             [hasSM<90>, hasPTX<78>]>;
2186    defm _f16  : ATOM2S_impl<OpStr, "f", "f16", f16, Int16Regs, f16imm, fpimm, f16,
2187                             [hasSM<70>, hasPTX<63>]>;
2188    defm _f32  : ATOM2S_impl<OpStr, "f", "f32", f32, Float32Regs, f32imm, fpimm, f32,
2189                             []>;
2190    defm _f64  : ATOM2S_impl<OpStr, "f", "f64", f64, Float64Regs, f64imm, fpimm, f64,
2191                             [hasAtomAddF64]>;
2194 // atom.{and,or,xor}
// Scoped bitwise atomics for `OpStr`: a b32 variant with no extra predicate
// and a b64 variant gated on hasAtomBitwise64.
2195 multiclass ATOM2_bitwise_impl<string OpStr> {
2196    defm _b32  : ATOM2S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>;
2197    defm _b64  : ATOM2S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64,
2198                             [hasAtomBitwise64]>;
2201 // atom.exch
// Scoped exchange atomics for `OpStr`: b32 and b64 variants, neither with an
// extra predicate.
2202 multiclass ATOM2_exch_impl<string OpStr> {
2203    defm _b32 : ATOM2S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>;
2204    defm _b64 : ATOM2S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, []>;
2207 // atom.{min,max}
// Scoped min/max atomics for `OpStr`: signed and unsigned 32-bit variants
// with no extra predicate, and signed and unsigned 64-bit variants gated on
// hasAtomMinMax64.
2208 multiclass ATOM2_minmax_impl<string OpStr> {
2209    defm _s32  : ATOM2S_impl<OpStr, "i", "s32", i32, Int32Regs, i32imm, imm, i32, []>;
2210    defm _u32  : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>;
2211    defm _s64  : ATOM2S_impl<OpStr, "i", "s64", i64, Int64Regs, i64imm, imm, i64,
2212                             [hasAtomMinMax64]>;
2213    defm _u64  : ATOM2S_impl<OpStr, "i", "u64", i64, Int64Regs, i64imm, imm, i64,
2214                             [hasAtomMinMax64]>;
2217 // atom.{inc,dec}
// Scoped increment/decrement atomics for `OpStr`: only a u32 variant is
// defined, with no extra predicate.
2218 multiclass ATOM2_incdec_impl<string OpStr> {
2219    defm _u32  : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>;
2222 // atom.cas
// Scoped compare-and-swap atomics for `OpStr`, built on the three-operand
// ATOM3S_impl: b32 and b64 variants, neither with an extra predicate.
2223 multiclass ATOM3_cas_impl<string OpStr> {
2224    defm _b32  : ATOM3S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>;
2225    defm _b64  : ATOM3S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, []>;
// Top-level instantiations of the scoped atomics, one per operation.  The
// string argument is the OpStr that ends up both in the asm string and in the
// intrinsic name looked up by ATOM2N_impl / ATOM3N_impl.
2228 defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">;
2229 defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">;
2230 defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">;
2231 defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">;
2232 defm INT_PTX_SATOM_EXCH: ATOM2_exch_impl<"exch">;
2233 defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">;
2234 defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">;
2235 defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">;
2236 defm INT_PTX_SATOM_OR  : ATOM2_bitwise_impl<"or">;
2237 defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
2239 //-----------------------------------
2240 // Support for ldu on sm_20 or later
2241 //-----------------------------------
2243 // Don't annotate ldu instructions as mayLoad, as they load from memory that is
2244 // read-only in a kernel.
2246 // Scalar
// LDU_G defines the five scalar "ldu.global." instruction records for one
// element type.  `TyStr` supplies the rest of the asm string (type suffix and
// operand list) and `regclass` is the result register class.  The records
// differ only in the address operand:
//   areg   - Int32Regs register      areg64 - Int64Regs register
//   avar   - imemAny operand
//   ari    - MEMri operand           ari64  - MEMri64 operand
// All have an empty pattern list, so they are not selected by a TableGen
// pattern defined here, and all require hasLDU.
2248 multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
2249   def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
2250                !strconcat("ldu.global.", TyStr),
2251                       []>, Requires<[hasLDU]>;
2252   def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
2253                !strconcat("ldu.global.", TyStr),
2254                         []>, Requires<[hasLDU]>;
2255  def avar:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
2256                !strconcat("ldu.global.", TyStr),
2257                       []>, Requires<[hasLDU]>;
2258  def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
2259                !strconcat("ldu.global.", TyStr),
2260                       []>, Requires<[hasLDU]>;
2261  def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
2262                !strconcat("ldu.global.", TyStr),
2263                         []>, Requires<[hasLDU]>;
// Scalar ldu instantiations.  Note that the i8 variant loads "u8" into
// Int16Regs, the same register class used by the i16 variant.
2266 defm INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
2267 defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
2268 defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
2269 defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
2270 defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
2271 defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
2273 // vector
2275 // Elementized vector ldu
// Two-element form: each record has two outputs ($dst1, $dst2) of `regclass`
// and one address operand -- Int32Regs (_areg32), Int64Regs (_areg64), MEMri
// (_ari32), MEMri64 (_ari64) or imemAny (_avar).  `TyStr` supplies the rest
// of the asm string after "ldu.global.".  Patterns are empty, and unlike the
// scalar LDU_G records these carry no Requires clause.
2276 multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
2277  def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2278                      (ins Int32Regs:$src),
2279                      !strconcat("ldu.global.", TyStr), []>;
2280  def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2281                      (ins Int64Regs:$src),
2282                      !strconcat("ldu.global.", TyStr), []>;
2283  def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2284                      (ins MEMri:$src),
2285                      !strconcat("ldu.global.", TyStr), []>;
2286  def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2287                      (ins MEMri64:$src),
2288                      !strconcat("ldu.global.", TyStr), []>;
2289  def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2290                      (ins imemAny:$src),
2291                      !strconcat("ldu.global.", TyStr), []>;
// Four-element elementized vector ldu: each record has four outputs
// ($dst1..$dst4) of `regclass` and one address operand -- Int32Regs
// (_areg32), Int64Regs (_areg64), MEMri (_ari32), MEMri64 (_ari64) or imemAny
// (_avar).  `TyStr` supplies the rest of the asm string after "ldu.global.".
// Patterns are empty and there is no Requires clause.
2294 multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
2295  def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2296                             regclass:$dst4), (ins Int32Regs:$src),
2297                !strconcat("ldu.global.", TyStr), []>;
2298  def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2299                             regclass:$dst4), (ins Int64Regs:$src),
2300                !strconcat("ldu.global.", TyStr), []>;
2301  def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2302                             regclass:$dst4), (ins MEMri:$src),
2303                !strconcat("ldu.global.", TyStr), []>;
2304  def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2305                             regclass:$dst4), (ins MEMri64:$src),
2306                !strconcat("ldu.global.", TyStr), []>;
2307  def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2308                             regclass:$dst4), (ins imemAny:$src),
2309                !strconcat("ldu.global.", TyStr), []>;
// Vector ldu instantiations.  8-bit and 16-bit integer elements both use
// Int16Regs.  The v4f16 variant uses the untyped "b16" type in Int16Regs and
// the v4f16x2 variant uses "b32" in Int32Regs.  There are no v4 variants for
// 64-bit element types in this list.
2312 defm INT_PTX_LDU_G_v2i8_ELE
2313   : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
2314 defm INT_PTX_LDU_G_v2i16_ELE
2315   : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2316 defm INT_PTX_LDU_G_v2i32_ELE
2317   : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
2318 defm INT_PTX_LDU_G_v2f32_ELE
2319   : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
2320 defm INT_PTX_LDU_G_v2i64_ELE
2321   : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
2322 defm INT_PTX_LDU_G_v2f64_ELE
2323   : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
2324 defm INT_PTX_LDU_G_v4i8_ELE
2325   : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
2326 defm INT_PTX_LDU_G_v4i16_ELE
2327   : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2328     Int16Regs>;
2329 defm INT_PTX_LDU_G_v4i32_ELE
2330   : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2331     Int32Regs>;
2332 defm INT_PTX_LDU_G_v4f16_ELE
2333   : VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2334     Int16Regs>;
2335 defm INT_PTX_LDU_G_v4f16x2_ELE
2336   : VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2337     Int32Regs>;
2338 defm INT_PTX_LDU_G_v4f32_ELE
2339   : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2340     Float32Regs>;
2343 //-----------------------------------
2344 // Support for ldg on sm_35 or later
2345 //-----------------------------------
2347 // Don't annotate ld.global.nc as mayLoad, because these loads go through the
2348 // non-coherent texture cache, and therefore the values read must be read-only
2349 // during the lifetime of the kernel.
// LDG_G defines the five scalar "ld.global.nc." instruction records for one
// element type.  `TyStr` supplies the rest of the asm string and `regclass`
// is the result register class.  The records differ only in the address
// operand:
//   areg   - Int32Regs register      areg64 - Int64Regs register
//   avar   - imemAny operand
//   ari    - MEMri operand           ari64  - MEMri64 operand
// All have an empty pattern list and all require hasLDG.
2351 multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
2352   def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
2353                !strconcat("ld.global.nc.", TyStr),
2354                       []>, Requires<[hasLDG]>;
2355   def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
2356                !strconcat("ld.global.nc.", TyStr),
2357                         []>, Requires<[hasLDG]>;
2358  def avar:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
2359                !strconcat("ld.global.nc.", TyStr),
2360                       []>, Requires<[hasLDG]>;
2361  def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
2362                !strconcat("ld.global.nc.", TyStr),
2363                       []>, Requires<[hasLDG]>;
2364  def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
2365                !strconcat("ld.global.nc.", TyStr),
2366                         []>, Requires<[hasLDG]>;
// Scalar ldg instantiations.  As with ldu, the i8 variant loads "u8" into
// Int16Regs, the same register class used by the i16 variant.
2369 defm INT_PTX_LDG_GLOBAL_i8
2370   : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
2371 defm INT_PTX_LDG_GLOBAL_i16
2372   : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
2373 defm INT_PTX_LDG_GLOBAL_i32
2374   : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
2375 defm INT_PTX_LDG_GLOBAL_i64
2376   : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
2377 defm INT_PTX_LDG_GLOBAL_f32
2378   : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
2379 defm INT_PTX_LDG_GLOBAL_f64
2380   : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
2382 // vector
2384 // Elementized vector ldg
// Two-element form: each record has two outputs ($dst1, $dst2) of `regclass`
// and one address operand -- Int32Regs (_areg32), Int64Regs (_areg64), MEMri
// (_ari32), MEMri64 (_ari64) or imemAny (_avar).  `TyStr` supplies the rest
// of the asm string after "ld.global.nc.".  Patterns are empty, and unlike
// the scalar LDG_G records these carry no Requires clause.
2385 multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
2386  def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2387                      (ins Int32Regs:$src),
2388                      !strconcat("ld.global.nc.", TyStr), []>;
2389  def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2390                      (ins Int64Regs:$src),
2391                      !strconcat("ld.global.nc.", TyStr), []>;
2392  def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2393                      (ins MEMri:$src),
2394                      !strconcat("ld.global.nc.", TyStr), []>;
2395  def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2396                      (ins MEMri64:$src),
2397                      !strconcat("ld.global.nc.", TyStr), []>;
2398  def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2399                      (ins imemAny:$src),
2400                      !strconcat("ld.global.nc.", TyStr), []>;
// Four-element elementized vector ldg: each record has four outputs
// ($dst1..$dst4) of `regclass` and one address operand -- Int32Regs
// (_areg32), Int64Regs (_areg64), MEMri (_ari32), MEMri64 (_ari64) or imemAny
// (_avar).  `TyStr` supplies the rest of the asm string after
// "ld.global.nc.".  Patterns are empty and there is no Requires clause.
2403 multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
2404   def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2405                               regclass:$dst4), (ins Int32Regs:$src),
2406                !strconcat("ld.global.nc.", TyStr), []>;
2407   def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2408                                regclass:$dst4), (ins Int64Regs:$src),
2409                !strconcat("ld.global.nc.", TyStr), []>;
2410   def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2411                               regclass:$dst4), (ins MEMri:$src),
2412                !strconcat("ld.global.nc.", TyStr), []>;
2413   def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2414                               regclass:$dst4), (ins MEMri64:$src),
2415                !strconcat("ld.global.nc.", TyStr), []>;
2416   def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2417                              regclass:$dst4), (ins imemAny:$src),
2418                !strconcat("ld.global.nc.", TyStr), []>;
2421 // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
// Vector ldg instantiations.  8-bit and 16-bit integer elements both use
// Int16Regs.  Unlike the ldu list, there are no b16/b32 (f16 / f16x2) v4
// variants here.
2422 defm INT_PTX_LDG_G_v2i8_ELE
2423   : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
2424 defm INT_PTX_LDG_G_v2i16_ELE
2425   : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2426 defm INT_PTX_LDG_G_v2i32_ELE
2427   : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
2428 defm INT_PTX_LDG_G_v2f32_ELE
2429   : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
2430 defm INT_PTX_LDG_G_v2i64_ELE
2431   : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
2432 defm INT_PTX_LDG_G_v2f64_ELE
2433   : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
2434 defm INT_PTX_LDG_G_v4i8_ELE
2435   : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
2436 defm INT_PTX_LDG_G_v4i16_ELE
2437   : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
2438 defm INT_PTX_LDG_G_v4i32_ELE
2439   : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
2440 defm INT_PTX_LDG_G_v4f32_ELE
2441   : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
// Address conversion from a named (non-generic) space to generic, selected
// for intrinsic `Intrin`.  `Str` is the space name spliced into the asm.
//   ""     - 32-bit source and result:  cvta.<Str>.u32
//   _64    - 64-bit source and result:  cvta.<Str>.u64
//   _6432  - 32-bit source, 64-bit result: widens $src into a scratch %tmp
//            register with cvt.u64.u32, then applies cvta.<Str>.u64.  Only
//            this record is gated on the `ShortPtr` predicate.
2444 multiclass NG_TO_G<string Str, Intrinsic Intrin, Predicate ShortPtr> {
2445    def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
2446           !strconcat("cvta.", Str, ".u32 \t$result, $src;"),
2447       [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
2448    def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2449           !strconcat("cvta.", Str, ".u64 \t$result, $src;"),
2450       [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
2451    def _6432 : NVPTXInst<(outs Int64Regs:$result), (ins Int32Regs:$src),
2452           "{{ .reg .b64 %tmp;\n\t"
2453           #"  cvt.u64.u32 \t%tmp, $src;\n\t"
2454           #"  cvta." # Str # ".u64 \t$result, %tmp; }}",
2455       [(set Int64Regs:$result, (Intrin Int32Regs:$src))]>,
2456       Requires<[ShortPtr]>;
// Address conversion from generic to a named (non-generic) space, selected
// for intrinsic `Intrin`.  `Str` is the space name spliced into the asm.
//   ""     - 32-bit source and result:  cvta.to.<Str>.u32
//   _64    - 64-bit source and result:  cvta.to.<Str>.u64
//   _3264  - 64-bit source, 32-bit result: applies cvta.to.<Str>.u64 into a
//            scratch %tmp register, then narrows with cvt.u32.u64.  Only this
//            record is gated on the `ShortPtr` predicate.
2459 multiclass G_TO_NG<string Str, Intrinsic Intrin, Predicate ShortPtr> {
2460    def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
2461           !strconcat("cvta.to.", Str, ".u32 \t$result, $src;"),
2462       [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
2463    def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2464           !strconcat("cvta.to.", Str, ".u64 \t$result, $src;"),
2465       [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
2466    def _3264 : NVPTXInst<(outs Int32Regs:$result), (ins Int64Regs:$src),
2467           "{{ .reg .b64 %tmp;\n\t"
2468           #"  cvta.to." # Str # ".u64 \t%tmp, $src;\n\t"
2469           #"  cvt.u32.u64 \t$result, %tmp; }}",
2470       [(set Int32Regs:$result, (Intrin Int64Regs:$src))]>,
2471       Requires<[ShortPtr]>;
// NG_TO_G instantiations for the local, shared, global, const and param
// spaces. The third argument is the predicate guarding the mixed-width
// (_6432) variant; global and param pass `False` there.
// NOTE(review): `False` is defined outside this section — presumably a
// never-satisfied predicate, which would disable _6432 for those spaces.
2474 defm cvta_local  : NG_TO_G<"local", int_nvvm_ptr_local_to_gen, useShortPtrLocal>;
2475 defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen, useShortPtrShared>;
2476 defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen, False>;
2477 defm cvta_const  : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen, useShortPtrConst>;
2478 defm cvta_param  : NG_TO_G<"param", int_nvvm_ptr_param_to_gen, False>;
// G_TO_NG instantiations for the local, shared, global and const spaces.
// The third argument is the predicate guarding the mixed-width (_3264)
// variant; global passes `False` there. There is no param instantiation
// here: gen-to-param is handled by the nvvm_ptr_gen_to_param defs below.
2480 defm cvta_to_local  : G_TO_NG<"local", int_nvvm_ptr_gen_to_local, useShortPtrLocal>;
2481 defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared, useShortPtrShared>;
2482 defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global, False>;
2483 defm cvta_to_const  : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant, useShortPtrConst>;
2485 // nvvm.ptr.gen.to.param
// 32-bit form: int_nvvm_ptr_gen_to_param is selected as a plain register
// copy (mov.u32); no cvta instruction is emitted.
2486 def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result),
2487   (ins Int32Regs:$src),
2488                         "mov.u32 \t$result, $src;",
2489                               [(set Int32Regs:$result,
2490                                 (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>;
// 64-bit form: int_nvvm_ptr_gen_to_param is selected as a plain register
// copy (mov.u64); no cvta instruction is emitted.
2491 def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result),
2492   (ins Int64Regs:$src),
2493                         "mov.u64 \t$result, $src;",
2494                               [(set Int64Regs:$result,
2495                                 (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>;
2498 // nvvm.move intrinsics
// Integer moves: int_nvvm_move_i16/i32/i64 are each selected as an untyped
// bit move of the matching width (mov.b16 / mov.b32 / mov.b64) between two
// registers of the same class.
2499 def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
2500                              "mov.b16 \t$r, $s;",
2501                              [(set Int16Regs:$r,
2502                                (int_nvvm_move_i16 Int16Regs:$s))]>;
2503 def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
2504                              "mov.b32 \t$r, $s;",
2505                              [(set Int32Regs:$r,
2506                                (int_nvvm_move_i32 Int32Regs:$s))]>;
2507 def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
2508                              "mov.b64 \t$r, $s;",
2509                              [(set Int64Regs:$r,
2510                                (int_nvvm_move_i64 Int64Regs:$s))]>;
// Floating-point moves: int_nvvm_move_float / int_nvvm_move_double are
// selected as typed moves (mov.f32 / mov.f64) on the float register classes.
2511 def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s),
2512                              "mov.f32 \t$r, $s;",
2513                              [(set Float32Regs:$r,
2514                                (int_nvvm_move_float Float32Regs:$s))]>;
2515 def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s),
2516                              "mov.f64 \t$r, $s;",
2517                              [(set Float64Regs:$r,
2518                                (int_nvvm_move_double Float64Regs:$s))]>;
// Pointer moves: the single intrinsic int_nvvm_move_ptr is matched at both
// pointer widths, producing mov.u32 for Int32Regs and mov.u64 for Int64Regs.
// These two defs are also used as the output of the MoveParam patterns
// further down.
2519 def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
2520                              "mov.u32 \t$r, $s;",
2521                              [(set Int32Regs:$r,
2522                                (int_nvvm_move_ptr Int32Regs:$s))]>;
2523 def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
2524                              "mov.u64 \t$r, $s;",
2525                              [(set Int64Regs:$r,
2526                                (int_nvvm_move_ptr Int64Regs:$s))]>;
2528 // @TODO: Are these actually needed, or will we always just see symbols
2529 // copied to registers first?
2530 /*def nvvm_move_sym32 : NVPTXInst<(outs Int32Regs:$r), (ins imem:$s),
2531                              "mov.u32 \t$r, $s;",
2532                              [(set Int32Regs:$r,
2533                              (int_nvvm_move_ptr texternalsym:$s))]>;
2534 def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s),
2535                              "mov.u64 \t$r, $s;",
2536                              [(set Int64Regs:$r,
2537                              (int_nvvm_move_ptr texternalsym:$s))]>;*/
2540 // MoveParam        %r1, param
2541 // ptr_local_to_gen %r2, %r1
2542 // ptr_gen_to_local %r3, %r2
2543 // ->
2544 // mov %r1, param
2546 // @TODO: Revisit this.  There is a type
2547 // contradiction between iPTRAny and iPTR for the addr defs, so the move_sym
2548 // instructions are not currently defined. However, we can use the ptr
2549 // variants and the asm printer will do the right thing.
// Fold gen_to_local(local_to_gen(MoveParam sym)) into one pointer move of
// the symbol. The i64 result type selects nvvm_move_ptr64 and the i32 result
// type selects nvvm_move_ptr32; the symbol operand is passed straight to the
// register-operand move instruction.
2550 def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
2551                 (MoveParam texternalsym:$src)))),
2552                (nvvm_move_ptr64  texternalsym:$src)>;
2553 def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
2554                 (MoveParam texternalsym:$src)))),
2555                (nvvm_move_ptr32  texternalsym:$src)>;
// Moves an `imem` operand into a 64-bit register with mov.u64. The pattern
// list is empty, so no TableGen pattern selects this instruction here.
// NOTE(review): presumably it is created from C++ code — confirm.
2557 def texsurf_handles
2558   : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
2559               "mov.u64 \t$result, $src;", []>;
2561 //-----------------------------------
2562 // Compiler Error Warn
2563 // - Just ignore them in codegen
2564 //-----------------------------------
// int_nvvm_compiler_warn with a 32-bit or 64-bit register argument. The
// emitted text is only a PTX comment; operand $a does not appear in the
// assembly string and there are no outputs.
2566 def INT_NVVM_COMPILER_WARN_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
2567                 "// llvm.nvvm.compiler.warn()",
2568                 [(int_nvvm_compiler_warn Int32Regs:$a)]>;
2569 def INT_NVVM_COMPILER_WARN_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
2570                 "// llvm.nvvm.compiler.warn()",
2571                 [(int_nvvm_compiler_warn Int64Regs:$a)]>;
// int_nvvm_compiler_error with a 32-bit or 64-bit register argument. The
// emitted text is only a PTX comment; operand $a does not appear in the
// assembly string and there are no outputs.
2572 def INT_NVVM_COMPILER_ERROR_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
2573                 "// llvm.nvvm.compiler.error()",
2574                 [(int_nvvm_compiler_error Int32Regs:$a)]>;
2575 def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
2576                 "// llvm.nvvm.compiler.error()",
2577                 [(int_nvvm_compiler_error Int64Regs:$a)]>;
2580 // isspacep
// ISSPACEP: defines _32 and _64 instructions that select intrinsic `Intr`
// on a 32-bit or 64-bit register operand as `isspacep.<suffix>`, writing an
// Int1Regs result. Both variants carry Requires<Preds>; Preds defaults to
// the empty list.
2582 multiclass ISSPACEP<string suffix, Intrinsic Intr, list<Predicate> Preds = []> {
2583   def _32: NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
2584               "isspacep." # suffix # "\t$d, $a;",
2585               [(set Int1Regs:$d, (Intr Int32Regs:$a))]>,
2586     Requires<Preds>;
2587   def _64: NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
2588               "isspacep." # suffix # "\t$d, $a;",
2589               [(set Int1Regs:$d, (Intr Int64Regs:$a))]>,
2590     Requires<Preds>;
// ISSPACEP instantiations. global, local and shared carry no extra
// predicates; const is gated on hasPTX<31>, and shared::cluster on both
// hasPTX<78> and hasSM<90>.
2593 defm isspace_const  : ISSPACEP<"const", int_nvvm_isspacep_const, [hasPTX<31>]>;
2594 defm isspace_global : ISSPACEP<"global", int_nvvm_isspacep_global>;
2595 defm isspace_local  : ISSPACEP<"local", int_nvvm_isspacep_local>;
2596 defm isspace_shared : ISSPACEP<"shared", int_nvvm_isspacep_shared>;
2597 defm isspace_shared_cluster : ISSPACEP<"shared::cluster",
2598                                        int_nvvm_isspacep_shared_cluster,
2599                                        [hasPTX<78>, hasSM<90>]>;
2601 // Special register reads
// Copies a SpecialRegs operand into a 32-bit register with mov.b32. It has
// no selection pattern of its own; the envreg Pat records that follow use it
// as their output instruction.
2602 def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
2603                             (ins SpecialRegs:$r),
2604                             "mov.b32 \t$d, $r;", []>;
// One pattern per environment register, N = 0..31:
// int_nvvm_read_ptx_sreg_envregN is selected as MOV_SPECIAL reading ENVREGN.
2606 def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
2607 def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
2608 def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>;
2609 def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>;
2610 def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>;
2611 def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>;
2612 def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>;
2613 def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>;
2614 def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>;
2615 def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>;
2616 def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>;
2617 def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>;
2618 def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>;
2619 def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>;
2620 def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>;
2621 def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>;
2622 def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>;
2623 def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>;
2624 def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>;
2625 def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>;
2626 def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>;
2627 def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>;
2628 def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>;
2629 def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>;
2630 def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>;
2631 def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>;
2632 def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>;
2633 def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>;
2634 def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>;
2635 def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>;
2636 def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
2637 def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
2640 // rotate builtin support
// int_nvvm_rotate_b32 with an immediate amount, when hasHWROT32 holds:
// emitted as shf.l.wrap.b32 with $src supplied as both data operands, so the
// funnel shift acts as a rotate of $src.
2642 def ROTATE_B32_HW_IMM
2643   : NVPTXInst<(outs Int32Regs:$dst),
2644               (ins  Int32Regs:$src, i32imm:$amt),
2645               "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
2646               [(set Int32Regs:$dst,
2647                  (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
2648               Requires<[hasHWROT32]> ;
// int_nvvm_rotate_b32 with the amount in a register, when hasHWROT32 holds:
// same shf.l.wrap.b32 encoding as the immediate form, with $src supplied as
// both data operands.
2650 def ROTATE_B32_HW_REG
2651   : NVPTXInst<(outs Int32Regs:$dst),
2652               (ins  Int32Regs:$src, Int32Regs:$amt),
2653               "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
2654               [(set Int32Regs:$dst,
2655                  (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
2656               Requires<[hasHWROT32]> ;
// Fallbacks for int_nvvm_rotate_b32 when noHWROT32 holds. The immediate form
// goes to ROT32imm_sw with both the amount and SUB_FRM_32 applied to it; the
// register form goes to ROTL32reg_sw. Both targets are defined elsewhere.
// NOTE(review): SUB_FRM_32 presumably yields (32 - amt) — confirm at its
// definition.
2658 def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
2659           (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
2660       Requires<[noHWROT32]> ;
2662 def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
2663           (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
2664       Requires<[noHWROT32]> ;
// Helpers that split a 64-bit register into 32-bit halves with an unpacking
// mov.b64 inside a braced scope that declares a scratch register %dummy.
// GET_LO_INT64 places $dst first in the destination pair and GET_HI_INT64
// places it second; the other element goes to %dummy and is discarded.
// Neither has a selection pattern; both are marked hasSideEffects = false.
2666 let hasSideEffects = false in {
2667   def GET_LO_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
2668     !strconcat("{{\n\t",
2669                ".reg .b32 %dummy;\n\t",
2670                "mov.b64 \t{$dst,%dummy}, $src;\n\t",
2671                "}}"),
2672           []> ;
2674   def GET_HI_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
2675     !strconcat("{{\n\t",
2676                ".reg .b32 %dummy;\n\t",
2677                "mov.b64 \t{%dummy,$dst}, $src;\n\t",
2678                "}}"),
2679           []> ;
// Packs two 32-bit registers into one 64-bit register with a vector-source
// mov.b64; $lo is listed first and $hi second. No selection pattern; marked
// hasSideEffects = false.
2682 let hasSideEffects = false in {
2683   def PACK_TWO_INT32
2684     : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
2685                 "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
// int_nvvm_swap_lo_hi_b64: repack the source with its halves exchanged —
// the GET_HI result is passed as PACK_TWO_INT32's $lo operand and the
// GET_LO result as its $hi operand.
2688 def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
2689           (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
2690                           (GET_LO_INT64 Int64Regs:$src))> ;
2692 // Funnel shift, requires >= sm_32.  Does not trap if amt is out of range, so
2693 // no side effects.
// Four pattern-less funnel-shift instructions: left (shf.l.wrap.b32) and
// right (shf.r.wrap.b32), each with the amount as an immediate (_IMM) or a
// register (_REG). All take separate $lo and $hi data operands, require
// hasHWROT32, and are marked hasSideEffects = false. They are used as
// building blocks by the 64-bit rotate patterns that follow.
2694 let hasSideEffects = false in {
2695   def SHF_L_WRAP_B32_IMM
2696     : NVPTXInst<(outs Int32Regs:$dst),
2697                 (ins  Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
2698                 "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
2699       Requires<[hasHWROT32]>;
2701   def SHF_L_WRAP_B32_REG
2702     : NVPTXInst<(outs Int32Regs:$dst),
2703                 (ins  Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
2704                 "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
2705       Requires<[hasHWROT32]>;
2707   def SHF_R_WRAP_B32_IMM
2708     : NVPTXInst<(outs Int32Regs:$dst),
2709                 (ins  Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
2710                 "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
2711       Requires<[hasHWROT32]>;
2713   def SHF_R_WRAP_B32_REG
2714     : NVPTXInst<(outs Int32Regs:$dst),
2715                 (ins  Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
2716                 "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
2717       Requires<[hasHWROT32]>;
2720 // HW version of rotate 64
// int_nvvm_rotate_b64 (immediate and register amount) when hasHWROT32 holds.
// The source is split into halves, each result half is produced by one
// left funnel shift, and the halves are repacked:
//   result lo = SHF_L_WRAP($lo = HI(src), $hi = LO(src), amt)
//   result hi = SHF_L_WRAP($lo = LO(src), $hi = HI(src), amt)
// NOTE(review): with the .wrap modifier the shift amount is presumably
// reduced modulo 32, so amounts >= 32 may not yield a full 64-bit rotate —
// confirm against the PTX ISA description of shf.
2721 def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
2722           (PACK_TWO_INT32
2723             (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
2724                                 (GET_LO_INT64 Int64Regs:$src), imm:$amt),
2725             (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
2726                                 (GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
2727       Requires<[hasHWROT32]>;
2729 def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
2730           (PACK_TWO_INT32
2731             (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
2732                                 (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
2733             (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
2734                                (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
2735       Requires<[hasHWROT32]>;
// int_nvvm_rotate_right_b64 (immediate and register amount) when hasHWROT32
// holds. Mirror image of the left-rotate patterns, using right funnel
// shifts:
//   result lo = SHF_R_WRAP($lo = LO(src), $hi = HI(src), amt)
//   result hi = SHF_R_WRAP($lo = HI(src), $hi = LO(src), amt)
// NOTE(review): the same caveat about shift amounts >= 32 under .wrap
// presumably applies here — confirm against the PTX ISA.
2738 def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
2739           (PACK_TWO_INT32
2740             (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
2741                                 (GET_HI_INT64 Int64Regs:$src), imm:$amt),
2742             (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
2743                                 (GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
2744       Requires<[hasHWROT32]>;
2746 def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
2747           (PACK_TWO_INT32
2748             (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
2749                                 (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
2750             (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
2751                                (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
2752       Requires<[hasHWROT32]>;
2754 // SW version of rotate 64
// Fallbacks for the 64-bit rotates when noHWROT32 holds. Immediate amounts
// use ROT64imm_sw for both directions: the left rotate passes
// (amt, SUB_FRM_64 amt) and the right rotate passes the same two values in
// the opposite order. Register amounts use ROTL64reg_sw / ROTR64reg_sw.
// NOTE(review): SUB_FRM_64 presumably yields (64 - amt) — confirm at its
// definition.
2755 def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
2756           (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>,
2757       Requires<[noHWROT32]>;
2758 def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
2759           (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
2760       Requires<[noHWROT32]>;
2761 def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
2762           (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
2763       Requires<[noHWROT32]>;
2764 def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
2765           (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
2766       Requires<[noHWROT32]>;
2769 //-----------------------------------
2770 // Texture Intrinsics
2771 //-----------------------------------
2773 // NOTE: For Fermi support, any new texture/surface/sampler intrinsics must
2774 // also be defined in NVPTXReplaceImageHandles.cpp
2776 // texmode_independent
2777 let IsTex = true, IsTexModeUnified = false in {
2778 // Texture fetch instructions using handles
// Base record for a 1D texture fetch. Produces four `outtype` results
// ($r, $g, $b, $a). The input list is `texsamp` (expected to supply the $t
// and $s operands referenced by the assembly string) with one `intype`
// coordinate $x appended via !con. No selection pattern is attached.
2780 class TEX_1D_base<string inst, NVPTXRegClass outtype,
2781                   NVPTXRegClass intype, dag texsamp>
2782     : NVPTXInst<(outs outtype:$r, outtype:$g,
2783                       outtype:$b, outtype:$a),
2784                  !con(texsamp, (ins intype:$x)),
2785                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
2786                  []>;
// Expands TEX_1D_base into four variants, one per combination of how $t and
// $s are supplied. In the suffix the first letter describes $t and the
// second $s: R = Int64Regs register, I = i64imm immediate.
2788 multiclass TEX_1D<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
2789   def _RR : TEX_1D_base<inst, outtype, intype,
2790                         (ins Int64Regs:$t, Int64Regs:$s)>;
2791   def _RI : TEX_1D_base<inst, outtype, intype,
2792                         (ins Int64Regs:$t, i64imm:$s)>;
2793   def _IR : TEX_1D_base<inst, outtype, intype,
2794                         (ins i64imm:$t, Int64Regs:$s)>;
2795   def _II : TEX_1D_base<inst, outtype, intype,
2796                         (ins i64imm:$t, i64imm:$s)>;
// TEX_1D instantiations. In each mnemonic the first type suffix is the
// result type and the second the coordinate type; f32 maps to Float32Regs,
// while both s32 and u32 map to Int32Regs.
2799 defm TEX_1D_F32_S32 : TEX_1D<"tex.1d.v4.f32.s32", Float32Regs, Int32Regs>;
2800 defm TEX_1D_F32_F32 : TEX_1D<"tex.1d.v4.f32.f32", Float32Regs, Float32Regs>;
2801 defm TEX_1D_S32_S32 : TEX_1D<"tex.1d.v4.s32.s32", Int32Regs, Int32Regs>;
2802 defm TEX_1D_S32_F32 : TEX_1D<"tex.1d.v4.s32.f32", Int32Regs, Float32Regs>;
2803 defm TEX_1D_U32_S32 : TEX_1D<"tex.1d.v4.u32.s32", Int32Regs, Int32Regs>;
2804 defm TEX_1D_U32_F32 : TEX_1D<"tex.1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 1D texture fetch with an extra `intype` operand $lod,
// printed after the bracketed address. Four `outtype` results; `texsamp` is
// expected to supply $t and $s. No selection pattern is attached.
2806 class TEX_1D_LEVEL_base<string inst, NVPTXRegClass outtype,
2807                         NVPTXRegClass intype, dag texsamp>
2808     : NVPTXInst<(outs outtype:$r, outtype:$g,
2809                       outtype:$b, outtype:$a),
2810                  !con(texsamp, (ins intype:$x, intype:$lod)),
2811                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}], $lod;",
2812                  []>;
// Expands TEX_1D_LEVEL_base into the _RR/_RI/_IR/_II variants; the first
// suffix letter describes $t and the second $s (R = Int64Regs register,
// I = i64imm immediate).
2814 multiclass TEX_1D_LEVEL<string inst, NVPTXRegClass outtype,
2815                         NVPTXRegClass intype> {
2816   def _RR : TEX_1D_LEVEL_base<inst, outtype, intype,
2817                               (ins Int64Regs:$t, Int64Regs:$s)>;
2818   def _RI : TEX_1D_LEVEL_base<inst, outtype, intype,
2819                               (ins Int64Regs:$t, i64imm:$s)>;
2820   def _IR : TEX_1D_LEVEL_base<inst, outtype, intype,
2821                               (ins i64imm:$t, Int64Regs:$s)>;
2822   def _II : TEX_1D_LEVEL_base<inst, outtype, intype,
2823                               (ins i64imm:$t, i64imm:$s)>;
// TEX_1D_LEVEL instantiations (tex.level.1d): f32, s32 and u32 results, all
// with Float32Regs coordinates.
2826 defm TEX_1D_F32_F32_LEVEL :
2827   TEX_1D_LEVEL<"tex.level.1d.v4.f32.f32", Float32Regs, Float32Regs>;
2828 defm TEX_1D_S32_F32_LEVEL :
2829   TEX_1D_LEVEL<"tex.level.1d.v4.s32.f32", Int32Regs, Float32Regs>;
2830 defm TEX_1D_U32_F32_LEVEL :
2831   TEX_1D_LEVEL<"tex.level.1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 1D texture fetch with two extra `intype` operands,
// $gradx and $grady, each printed as its own single-element brace group
// after the bracketed address. Four `outtype` results; `texsamp` is expected
// to supply $t and $s. No selection pattern is attached.
2833 class TEX_1D_GRAD_base<string inst, NVPTXRegClass outtype,
2834                        NVPTXRegClass intype, dag texsamp>
2835     : NVPTXInst<(outs outtype:$r, outtype:$g,
2836                       outtype:$b, outtype:$a),
2837                  !con(texsamp, (ins intype:$x, intype:$gradx, intype:$grady)),
2838                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}],"
2839                         " \\{$gradx\\}, \\{$grady\\};",
2840                  []>;
// Expands TEX_1D_GRAD_base into the _RR/_RI/_IR/_II variants; the first
// suffix letter describes $t and the second $s (R = Int64Regs register,
// I = i64imm immediate).
2842 multiclass TEX_1D_GRAD<string inst, NVPTXRegClass outtype,
2843                        NVPTXRegClass intype> {
2844   def _RR : TEX_1D_GRAD_base<inst, outtype, intype,
2845                              (ins Int64Regs:$t, Int64Regs:$s)>;
2846   def _RI : TEX_1D_GRAD_base<inst, outtype, intype,
2847                              (ins Int64Regs:$t, i64imm:$s)>;
2848   def _IR : TEX_1D_GRAD_base<inst, outtype, intype,
2849                              (ins i64imm:$t, Int64Regs:$s)>;
2850   def _II : TEX_1D_GRAD_base<inst, outtype, intype,
2851                              (ins i64imm:$t, i64imm:$s)>;
// TEX_1D_GRAD instantiations (tex.grad.1d): f32, s32 and u32 results, all
// with Float32Regs coordinates.
2854 defm TEX_1D_F32_F32_GRAD
2855   : TEX_1D_GRAD<"tex.grad.1d.v4.f32.f32", Float32Regs, Float32Regs>;
2856 defm TEX_1D_S32_F32_GRAD
2857   : TEX_1D_GRAD<"tex.grad.1d.v4.s32.f32", Int32Regs, Float32Regs>;
2858 defm TEX_1D_U32_F32_GRAD
2859   : TEX_1D_GRAD<"tex.grad.1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 1D-array texture fetch. Besides the `intype` coordinate
// $x it takes an Int32Regs operand $l, printed first inside the coordinate
// braces as {$l, $x}. Four `outtype` results; `texsamp` is expected to
// supply $t and $s. No selection pattern is attached.
2861 class TEX_1D_ARRAY_base<string inst, NVPTXRegClass outtype,
2862                         NVPTXRegClass intype, dag texsamp>
2863     : NVPTXInst<(outs outtype:$r, outtype:$g,
2864                       outtype:$b, outtype:$a),
2865                  !con(texsamp, (ins Int32Regs:$l, intype:$x)),
2866                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$l, $x\\}];",
2867                  []>;
// Expands TEX_1D_ARRAY_base into the _RR/_RI/_IR/_II variants; the first
// suffix letter describes $t and the second $s (R = Int64Regs register,
// I = i64imm immediate).
2869 multiclass TEX_1D_ARRAY<string inst, NVPTXRegClass outtype,
2870                         NVPTXRegClass intype> {
2871   def _RR : TEX_1D_ARRAY_base<inst, outtype, intype,
2872                               (ins Int64Regs:$t, Int64Regs:$s)>;
2873   def _RI : TEX_1D_ARRAY_base<inst, outtype, intype,
2874                               (ins Int64Regs:$t, i64imm:$s)>;
2875   def _IR : TEX_1D_ARRAY_base<inst, outtype, intype,
2876                               (ins i64imm:$t, Int64Regs:$s)>;
2877   def _II : TEX_1D_ARRAY_base<inst, outtype, intype,
2878                               (ins i64imm:$t, i64imm:$s)>;
// TEX_1D_ARRAY instantiations (tex.a1d): every combination of f32/s32/u32
// results with f32 or s32 coordinates. f32 maps to Float32Regs; s32 and u32
// both map to Int32Regs.
2881 defm TEX_1D_ARRAY_F32_F32
2882   : TEX_1D_ARRAY<"tex.a1d.v4.f32.f32", Float32Regs, Float32Regs>;
2883 defm TEX_1D_ARRAY_F32_S32
2884   : TEX_1D_ARRAY<"tex.a1d.v4.f32.s32", Float32Regs, Int32Regs>;
2885 defm TEX_1D_ARRAY_S32_S32
2886   : TEX_1D_ARRAY<"tex.a1d.v4.s32.s32", Int32Regs, Int32Regs>;
2887 defm TEX_1D_ARRAY_S32_F32
2888   : TEX_1D_ARRAY<"tex.a1d.v4.s32.f32", Int32Regs, Float32Regs>;
2889 defm TEX_1D_ARRAY_U32_S32
2890   : TEX_1D_ARRAY<"tex.a1d.v4.u32.s32", Int32Regs, Int32Regs>;
2891 defm TEX_1D_ARRAY_U32_F32
2892   : TEX_1D_ARRAY<"tex.a1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 1D-array texture fetch with an extra `intype` operand
// $lod printed after the bracketed address. The Int32Regs operand $l is
// printed first inside the coordinate braces. Four `outtype` results;
// `texsamp` is expected to supply $t and $s. No selection pattern.
2894 class TEX_1D_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
2895                               NVPTXRegClass intype, dag texsamp>
2896     : NVPTXInst<(outs outtype:$r, outtype:$g,
2897                       outtype:$b, outtype:$a),
2898                  !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$lod)),
2899                  inst # " \t\\{$r, $g, $b, $a\\},"
2900                         " [$t, $s, \\{$l, $x\\}], $lod;",
2901                  []>;
// Expands TEX_1D_ARRAY_LEVEL_base into the _RR/_RI/_IR/_II variants; the
// first suffix letter describes $t and the second $s (R = Int64Regs
// register, I = i64imm immediate).
2903 multiclass TEX_1D_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
2904                               NVPTXRegClass intype> {
2905   def _RR : TEX_1D_ARRAY_LEVEL_base<inst, outtype, intype,
2906                                     (ins Int64Regs:$t, Int64Regs:$s)>;
2907   def _RI : TEX_1D_ARRAY_LEVEL_base<inst, outtype, intype,
2908                                     (ins Int64Regs:$t, i64imm:$s)>;
2909   def _IR : TEX_1D_ARRAY_LEVEL_base<inst, outtype, intype,
2910                                     (ins i64imm:$t, Int64Regs:$s)>;
2911   def _II : TEX_1D_ARRAY_LEVEL_base<inst, outtype, intype,
2912                                     (ins i64imm:$t, i64imm:$s)>;
// TEX_1D_ARRAY_LEVEL instantiations (tex.level.a1d): f32, s32 and u32
// results, all with Float32Regs coordinates.
2915 defm TEX_1D_ARRAY_F32_F32_LEVEL
2916   : TEX_1D_ARRAY_LEVEL<"tex.level.a1d.v4.f32.f32", Float32Regs, Float32Regs>;
2917 defm TEX_1D_ARRAY_S32_F32_LEVEL
2918   : TEX_1D_ARRAY_LEVEL<"tex.level.a1d.v4.s32.f32", Int32Regs, Float32Regs>;
2919 defm TEX_1D_ARRAY_U32_F32_LEVEL
2920   : TEX_1D_ARRAY_LEVEL<"tex.level.a1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 1D-array texture fetch with two extra `intype` operands,
// $gradx and $grady, each printed as a single-element brace group after the
// bracketed address. The Int32Regs operand $l is printed first inside the
// coordinate braces. Four `outtype` results; `texsamp` is expected to supply
// $t and $s. No selection pattern.
2922 class TEX_1D_ARRAY_GRAD_base<string inst, NVPTXRegClass outtype,
2923                              NVPTXRegClass intype, dag texsamp>
2924     : NVPTXInst<(outs outtype:$r, outtype:$g,
2925                       outtype:$b, outtype:$a),
2926                  !con(texsamp, (ins Int32Regs:$l, intype:$x,
2927                                     intype:$gradx, intype:$grady)),
2928                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$l, $x\\}],"
2929                         " \\{$gradx\\}, \\{$grady\\};",
2930                  []>;
// Expands TEX_1D_ARRAY_GRAD_base into the _RR/_RI/_IR/_II variants; the
// first suffix letter describes $t and the second $s (R = Int64Regs
// register, I = i64imm immediate).
2932 multiclass TEX_1D_ARRAY_GRAD<string inst, NVPTXRegClass outtype,
2933                              NVPTXRegClass intype> {
2934   def _RR : TEX_1D_ARRAY_GRAD_base<inst, outtype, intype,
2935                                    (ins Int64Regs:$t, Int64Regs:$s)>;
2936   def _RI : TEX_1D_ARRAY_GRAD_base<inst, outtype, intype,
2937                                    (ins Int64Regs:$t, i64imm:$s)>;
2938   def _IR : TEX_1D_ARRAY_GRAD_base<inst, outtype, intype,
2939                                    (ins i64imm:$t, Int64Regs:$s)>;
2940   def _II : TEX_1D_ARRAY_GRAD_base<inst, outtype, intype,
2941                                    (ins i64imm:$t, i64imm:$s)>;
// TEX_1D_ARRAY_GRAD instantiations (tex.grad.a1d): f32, s32 and u32 results,
// all with Float32Regs coordinates.
2944 defm TEX_1D_ARRAY_F32_F32_GRAD
2945   : TEX_1D_ARRAY_GRAD<"tex.grad.a1d.v4.f32.f32", Float32Regs, Float32Regs>;
2946 defm TEX_1D_ARRAY_S32_F32_GRAD
2947   : TEX_1D_ARRAY_GRAD<"tex.grad.a1d.v4.s32.f32", Int32Regs, Float32Regs>;
2948 defm TEX_1D_ARRAY_U32_F32_GRAD
2949   : TEX_1D_ARRAY_GRAD<"tex.grad.a1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 2D texture fetch: two `intype` coordinates, $x and $y,
// printed as {$x, $y}. Four `outtype` results; `texsamp` is expected to
// supply the $t and $s operands. No selection pattern is attached.
2951 class TEX_2D_base<string inst, NVPTXRegClass outtype,
2952                   NVPTXRegClass intype, dag texsamp>
2953     : NVPTXInst<(outs outtype:$r, outtype:$g,
2954                       outtype:$b, outtype:$a),
2955                  !con(texsamp, (ins intype:$x, intype:$y)),
2956                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x, $y\\}];",
2957                  []>;
// Expands TEX_2D_base into the _RR/_RI/_IR/_II variants; the first suffix
// letter describes $t and the second $s (R = Int64Regs register,
// I = i64imm immediate).
2959 multiclass TEX_2D<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
2960   def _RR : TEX_2D_base<inst, outtype, intype,
2961                         (ins Int64Regs:$t, Int64Regs:$s)>;
2962   def _RI : TEX_2D_base<inst, outtype, intype, (ins Int64Regs:$t, i64imm:$s)>;
2963   def _IR : TEX_2D_base<inst, outtype, intype, (ins i64imm:$t, Int64Regs:$s)>;
2964   def _II : TEX_2D_base<inst, outtype, intype, (ins i64imm:$t, i64imm:$s)>;
// TEX_2D instantiations (tex.2d): every combination of f32/s32/u32 results
// with f32 or s32 coordinates. f32 maps to Float32Regs; s32 and u32 both map
// to Int32Regs.
2967 defm TEX_2D_F32_F32 : TEX_2D<"tex.2d.v4.f32.f32", Float32Regs, Float32Regs>;
2968 defm TEX_2D_F32_S32 : TEX_2D<"tex.2d.v4.f32.s32", Float32Regs, Int32Regs>;
2969 defm TEX_2D_S32_S32 : TEX_2D<"tex.2d.v4.s32.s32", Int32Regs, Int32Regs>;
2970 defm TEX_2D_S32_F32 : TEX_2D<"tex.2d.v4.s32.f32", Int32Regs, Float32Regs>;
2971 defm TEX_2D_U32_S32 : TEX_2D<"tex.2d.v4.u32.s32", Int32Regs, Int32Regs>;
2972 defm TEX_2D_U32_F32 : TEX_2D<"tex.2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 2D texture fetch with an extra `intype` operand $lod,
// printed after the bracketed address. Four `outtype` results; `texsamp` is
// expected to supply $t and $s. No selection pattern is attached.
2974 class TEX_2D_LEVEL_base<string inst, NVPTXRegClass outtype,
2975                         NVPTXRegClass intype, dag texsamp>
2976     : NVPTXInst<(outs outtype:$r, outtype:$g,
2977                       outtype:$b, outtype:$a),
2978                  !con(texsamp, (ins intype:$x, intype:$y, intype:$lod)),
2979                  inst # " \t\\{$r, $g, $b, $a\\},"
2980                         " [$t, $s, \\{$x, $y\\}], $lod;",
2981                  []>;
// Expands TEX_2D_LEVEL_base into the _RR/_RI/_IR/_II variants; the first
// suffix letter describes $t and the second $s (R = Int64Regs register,
// I = i64imm immediate).
2983 multiclass TEX_2D_LEVEL<string inst, NVPTXRegClass outtype,
2984                         NVPTXRegClass intype> {
2985   def _RR : TEX_2D_LEVEL_base<inst, outtype, intype,
2986                               (ins Int64Regs:$t, Int64Regs:$s)>;
2987   def _RI : TEX_2D_LEVEL_base<inst, outtype, intype,
2988                               (ins Int64Regs:$t, i64imm:$s)>;
2989   def _IR : TEX_2D_LEVEL_base<inst, outtype, intype,
2990                               (ins i64imm:$t, Int64Regs:$s)>;
2991   def _II : TEX_2D_LEVEL_base<inst, outtype, intype,
2992                               (ins i64imm:$t, i64imm:$s)>;
// TEX_2D_LEVEL instantiations (tex.level.2d): f32, s32 and u32 results, all
// with Float32Regs coordinates.
2995 defm TEX_2D_F32_F32_LEVEL :
2996   TEX_2D_LEVEL<"tex.level.2d.v4.f32.f32", Float32Regs, Float32Regs>;
2997 defm TEX_2D_S32_F32_LEVEL :
2998   TEX_2D_LEVEL<"tex.level.2d.v4.s32.f32", Int32Regs, Float32Regs>;
2999 defm TEX_2D_U32_F32_LEVEL :
3000   TEX_2D_LEVEL<"tex.level.2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 2D texture fetch with four extra `intype` operands,
// printed after the bracketed address as two two-element groups:
// {$gradx0, $gradx1} and {$grady0, $grady1}. Four `outtype` results;
// `texsamp` is expected to supply $t and $s. No selection pattern.
3002 class TEX_2D_GRAD_base<string inst, NVPTXRegClass outtype,
3003                        NVPTXRegClass intype, dag texsamp>
3004     : NVPTXInst<(outs outtype:$r, outtype:$g,
3005                       outtype:$b, outtype:$a),
3006                  !con(texsamp, (ins intype:$x, intype:$y,
3007                                     intype:$gradx0, intype:$gradx1,
3008                                     intype:$grady0, intype:$grady1)),
3009                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x, $y\\}],"
3010                         " \\{$gradx0, $gradx1\\}, \\{$grady0, $grady1\\};",
3011                  []>;
// Expands TEX_2D_GRAD_base into the _RR/_RI/_IR/_II variants; the first
// suffix letter describes $t and the second $s (R = Int64Regs register,
// I = i64imm immediate).
3013 multiclass TEX_2D_GRAD<string inst, NVPTXRegClass outtype,
3014                        NVPTXRegClass intype> {
3015   def _RR : TEX_2D_GRAD_base<inst, outtype, intype,
3016                               (ins Int64Regs:$t, Int64Regs:$s)>;
3017   def _RI : TEX_2D_GRAD_base<inst, outtype, intype,
3018                               (ins Int64Regs:$t, i64imm:$s)>;
3019   def _IR : TEX_2D_GRAD_base<inst, outtype, intype,
3020                               (ins i64imm:$t, Int64Regs:$s)>;
3021   def _II : TEX_2D_GRAD_base<inst, outtype, intype,
3022                               (ins i64imm:$t, i64imm:$s)>;
// TEX_2D_GRAD instantiations (tex.grad.2d): f32, s32 and u32 results, all
// with Float32Regs coordinates.
3025 defm TEX_2D_F32_F32_GRAD :
3026   TEX_2D_GRAD<"tex.grad.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3027 defm TEX_2D_S32_F32_GRAD :
3028   TEX_2D_GRAD<"tex.grad.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3029 defm TEX_2D_U32_F32_GRAD :
3030   TEX_2D_GRAD<"tex.grad.2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 2D-array texture fetch: Int32Regs operand $l plus
// `intype` coordinates $x and $y. The coordinate group is printed with four
// elements, {$l, $x, $y, $y}, i.e. $y appears twice.
// NOTE(review): presumably PTX requires a four-element vector for a2d and
// ignores the last element — confirm against the PTX ISA `tex` description.
// Four `outtype` results; `texsamp` is expected to supply $t and $s. No
// selection pattern is attached.
3032 class TEX_2D_ARRAY_base<string inst, NVPTXRegClass outtype,
3033                         NVPTXRegClass intype, dag texsamp>
3034     : NVPTXInst<(outs outtype:$r, outtype:$g,
3035                       outtype:$b, outtype:$a),
3036                  !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y)),
3037                  inst # " \t\\{$r, $g, $b, $a\\},"
3038                         " [$t, $s, \\{$l, $x, $y, $y\\}];",
3039                  []>;
// Expands TEX_2D_ARRAY_base into the _RR/_RI/_IR/_II variants; the first
// suffix letter describes $t and the second $s (R = Int64Regs register,
// I = i64imm immediate).
3041 multiclass TEX_2D_ARRAY<string inst, NVPTXRegClass outtype,
3042                         NVPTXRegClass intype> {
3043   def _RR : TEX_2D_ARRAY_base<inst, outtype, intype,
3044                               (ins Int64Regs:$t, Int64Regs:$s)>;
3045   def _RI : TEX_2D_ARRAY_base<inst, outtype, intype,
3046                               (ins Int64Regs:$t, i64imm:$s)>;
3047   def _IR : TEX_2D_ARRAY_base<inst, outtype, intype,
3048                               (ins i64imm:$t, Int64Regs:$s)>;
3049   def _II : TEX_2D_ARRAY_base<inst, outtype, intype,
3050                               (ins i64imm:$t, i64imm:$s)>;
// TEX_2D_ARRAY instantiations (tex.a2d): every combination of f32/s32/u32
// results with f32 or s32 coordinates. f32 maps to Float32Regs; s32 and u32
// both map to Int32Regs.
3053 defm TEX_2D_ARRAY_F32_F32
3054   : TEX_2D_ARRAY<"tex.a2d.v4.f32.f32", Float32Regs, Float32Regs>;
3055 defm TEX_2D_ARRAY_F32_S32
3056   : TEX_2D_ARRAY<"tex.a2d.v4.f32.s32", Float32Regs, Int32Regs>;
3057 defm TEX_2D_ARRAY_S32_S32
3058   : TEX_2D_ARRAY<"tex.a2d.v4.s32.s32", Int32Regs, Int32Regs>;
3059 defm TEX_2D_ARRAY_S32_F32
3060   : TEX_2D_ARRAY<"tex.a2d.v4.s32.f32", Int32Regs, Float32Regs>;
3061 defm TEX_2D_ARRAY_U32_S32
3062   : TEX_2D_ARRAY<"tex.a2d.v4.u32.s32", Int32Regs, Int32Regs>;
3063 defm TEX_2D_ARRAY_U32_F32
3064   : TEX_2D_ARRAY<"tex.a2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for a 2D-array texture fetch with an extra `intype` operand
// $lod printed after the bracketed address. The coordinate group is printed
// as {$l, $x, $y, $y}, with $y appearing twice as in TEX_2D_ARRAY_base.
// Four `outtype` results; `texsamp` is expected to supply $t and $s. No
// selection pattern is attached.
3066 class TEX_2D_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
3067                               NVPTXRegClass intype, dag texsamp>
3068     : NVPTXInst<(outs outtype:$r, outtype:$g,
3069                       outtype:$b, outtype:$a),
3070                  !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y,
3071                                     intype:$lod)),
3072                  inst # " \t\\{$r, $g, $b, $a\\},"
3073                         " [$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
3074                  []>;
// Expands TEX_2D_ARRAY_LEVEL_base into the _RR/_RI/_IR/_II variants; the
// first suffix letter describes $t and the second $s (R = Int64Regs
// register, I = i64imm immediate).
3076 multiclass TEX_2D_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
3077                               NVPTXRegClass intype> {
3078   def _RR : TEX_2D_ARRAY_LEVEL_base<inst, outtype, intype,
3079                               (ins Int64Regs:$t, Int64Regs:$s)>;
3080   def _RI : TEX_2D_ARRAY_LEVEL_base<inst, outtype, intype,
3081                               (ins Int64Regs:$t, i64imm:$s)>;
3082   def _IR : TEX_2D_ARRAY_LEVEL_base<inst, outtype, intype,
3083                               (ins i64imm:$t, Int64Regs:$s)>;
3084   def _II : TEX_2D_ARRAY_LEVEL_base<inst, outtype, intype,
3085                               (ins i64imm:$t, i64imm:$s)>;
// TEX_2D_ARRAY_LEVEL instantiations (tex.level.a2d): f32, s32 and u32
// results, all with Float32Regs coordinates.
3088 defm TEX_2D_ARRAY_F32_F32_LEVEL
3089   : TEX_2D_ARRAY_LEVEL<"tex.level.a2d.v4.f32.f32", Float32Regs, Float32Regs>;
3090 defm TEX_2D_ARRAY_S32_F32_LEVEL
3091   : TEX_2D_ARRAY_LEVEL<"tex.level.a2d.v4.s32.f32", Int32Regs, Float32Regs>;
3092 defm TEX_2D_ARRAY_U32_F32_LEVEL
3093   : TEX_2D_ARRAY_LEVEL<"tex.level.a2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for 2D-array fetches that take two gradient pairs.
//   inst    - mnemonic text placed at the start of the asm string.
//   outtype - register class of the four results ($r, $g, $b, $a).
//   intype  - register class of $x, $y and the four gradient operands.
//   texsamp - leading input dag; must define $t and $s (used by the asm
//             string).
// Operand order after texsamp: $l (Int32Regs), $x, $y, $gradx0, $gradx1,
// $grady0, $grady1. The asm string prints $y twice in the coordinate list and
// emits the gradients as two 2-element brace lists.
3095 class TEX_2D_ARRAY_GRAD_base<string inst, NVPTXRegClass outtype,
3096                              NVPTXRegClass intype, dag texsamp>
3097     : NVPTXInst<(outs outtype:$r, outtype:$g,
3098                       outtype:$b, outtype:$a),
3099                  !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y,
3100                                     intype:$gradx0, intype:$gradx1,
3101                                     intype:$grady0, intype:$grady1)),
3102                  inst # " \t\\{$r, $g, $b, $a\\},"
3103                         " [$t, $s, \\{$l, $x, $y, $y\\}],"
3104                         " \\{$gradx0, $gradx1\\}, \\{$grady0, $grady1\\};",
3105                  []>;
// Emits the _RR/_RI/_IR/_II variants of TEX_2D_ARRAY_GRAD_base. The first
// suffix letter is the kind of $t and the second the kind of $s
// (R = Int64Regs, I = i64imm).
3107 multiclass TEX_2D_ARRAY_GRAD<string inst, NVPTXRegClass outtype,
3108                              NVPTXRegClass intype> {
3109   def _RR : TEX_2D_ARRAY_GRAD_base<inst, outtype, intype,
3110                               (ins Int64Regs:$t, Int64Regs:$s)>;
3111   def _RI : TEX_2D_ARRAY_GRAD_base<inst, outtype, intype,
3112                               (ins Int64Regs:$t, i64imm:$s)>;
3113   def _IR : TEX_2D_ARRAY_GRAD_base<inst, outtype, intype,
3114                               (ins i64imm:$t, Int64Regs:$s)>;
3115   def _II : TEX_2D_ARRAY_GRAD_base<inst, outtype, intype,
3116                               (ins i64imm:$t, i64imm:$s)>;
// "tex.grad.a2d" instantiations: Float32Regs coordinates with f32, s32 or
// u32 results (s32/u32 share Int32Regs).
3119 defm TEX_2D_ARRAY_F32_F32_GRAD
3120   : TEX_2D_ARRAY_GRAD<"tex.grad.a2d.v4.f32.f32", Float32Regs, Float32Regs>;
3121 defm TEX_2D_ARRAY_S32_F32_GRAD
3122   : TEX_2D_ARRAY_GRAD<"tex.grad.a2d.v4.s32.f32", Int32Regs, Float32Regs>;
3123 defm TEX_2D_ARRAY_U32_F32_GRAD
3124   : TEX_2D_ARRAY_GRAD<"tex.grad.a2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for 3D fetches.
//   inst    - mnemonic text placed at the start of the asm string.
//   outtype - register class of the four results ($r, $g, $b, $a).
//   intype  - register class of the $x, $y, $z coordinates.
//   texsamp - leading input dag; must define $t and $s (used by the asm
//             string).
// The asm string prints $z twice so the coordinate list has four entries.
3126 class TEX_3D_base<string inst, NVPTXRegClass outtype,
3127                   NVPTXRegClass intype, dag texsamp>
3128     : NVPTXInst<(outs outtype:$r, outtype:$g,
3129                       outtype:$b, outtype:$a),
3130                  !con(texsamp, (ins intype:$x, intype:$y, intype:$z)),
3131                  inst # " \t\\{$r, $g, $b, $a\\},"
3132                         " [$t, $s, \\{$x, $y, $z, $z\\}];",
3133                  []>;
// Emits the four $t/$s operand-kind variants of TEX_3D_base
// (suffix letters: R = Int64Regs, I = i64imm; first letter is $t, second $s).
3135 multiclass TEX_3D<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
3136   def _RR : TEX_3D_base<inst, outtype, intype,
3137                               (ins Int64Regs:$t, Int64Regs:$s)>;
3138   def _RI : TEX_3D_base<inst, outtype, intype,
3139                               (ins Int64Regs:$t, i64imm:$s)>;
3140   def _IR : TEX_3D_base<inst, outtype, intype,
3141                               (ins i64imm:$t, Int64Regs:$s)>;
3142   def _II : TEX_3D_base<inst, outtype, intype,
3143                               (ins i64imm:$t, i64imm:$s)>;
// "tex.3d" instantiations for each result/coordinate type pairing. The S32
// and U32 result variants take the same register classes and differ only in
// the mnemonic.
3146 defm TEX_3D_F32_F32 : TEX_3D<"tex.3d.v4.f32.f32", Float32Regs, Float32Regs>;
3147 defm TEX_3D_F32_S32 : TEX_3D<"tex.3d.v4.f32.s32", Float32Regs, Int32Regs>;
3148 defm TEX_3D_S32_S32 : TEX_3D<"tex.3d.v4.s32.s32", Int32Regs, Int32Regs>;
3149 defm TEX_3D_S32_F32 : TEX_3D<"tex.3d.v4.s32.f32", Int32Regs, Float32Regs>;
3150 defm TEX_3D_U32_S32 : TEX_3D<"tex.3d.v4.u32.s32", Int32Regs, Int32Regs>;
3151 defm TEX_3D_U32_F32 : TEX_3D<"tex.3d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for 3D fetches with an explicit $lod operand.
// Same parameters as the other *_base classes: `inst` is the mnemonic,
// `outtype` the class of $r/$g/$b/$a, `intype` the class of $x/$y/$z/$lod,
// and `texsamp` the leading input dag that must supply $t and $s.
// $z is printed twice in the coordinate list; $lod follows the bracket.
3153 class TEX_3D_LEVEL_base<string inst, NVPTXRegClass outtype,
3154                         NVPTXRegClass intype, dag texsamp>
3155     : NVPTXInst<(outs outtype:$r, outtype:$g,
3156                       outtype:$b, outtype:$a),
3157                  !con(texsamp, (ins intype:$x, intype:$y, intype:$z,
3158                                     intype:$lod)),
3159                  inst # " \t\\{$r, $g, $b, $a\\},"
3160                         " [$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
3161                  []>;
// Emits _RR/_RI/_IR/_II records of TEX_3D_LEVEL_base, covering every
// Int64Regs / i64imm combination for $t (first letter) and $s (second).
3163 multiclass TEX_3D_LEVEL<string inst, NVPTXRegClass outtype,
3164                         NVPTXRegClass intype> {
3165   def _RR : TEX_3D_LEVEL_base<inst, outtype, intype,
3166                               (ins Int64Regs:$t, Int64Regs:$s)>;
3167   def _RI : TEX_3D_LEVEL_base<inst, outtype, intype,
3168                               (ins Int64Regs:$t, i64imm:$s)>;
3169   def _IR : TEX_3D_LEVEL_base<inst, outtype, intype,
3170                               (ins i64imm:$t, Int64Regs:$s)>;
3171   def _II : TEX_3D_LEVEL_base<inst, outtype, intype,
3172                               (ins i64imm:$t, i64imm:$s)>;
// "tex.level.3d" instantiations: Float32Regs coordinates, with f32, s32 or
// u32 results (s32/u32 both in Int32Regs).
3175 defm TEX_3D_F32_F32_LEVEL
3176   : TEX_3D_LEVEL<"tex.level.3d.v4.f32.f32", Float32Regs, Float32Regs>;
3177 defm TEX_3D_S32_F32_LEVEL
3178   : TEX_3D_LEVEL<"tex.level.3d.v4.s32.f32", Int32Regs, Float32Regs>;
3179 defm TEX_3D_U32_F32_LEVEL
3180   : TEX_3D_LEVEL<"tex.level.3d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for 3D fetches with two 3-component gradients.
//   inst    - mnemonic text placed at the start of the asm string.
//   outtype - register class of the four results ($r, $g, $b, $a).
//   intype  - register class of $x/$y/$z and all six gradient operands.
//   texsamp - leading input dag; must define $t and $s (used by the asm
//             string).
// The asm string pads each 3-component group to four entries by repeating
// its last operand ($z, $gradx2, $grady2).
// NOTE(review): "intype :$gradx0" below has a space before the colon, unlike
// every other operand in this dag.
3182 class TEX_3D_GRAD_base<string inst, NVPTXRegClass outtype,
3183                        NVPTXRegClass intype, dag texsamp>
3184     : NVPTXInst<(outs outtype:$r, outtype:$g,
3185                       outtype:$b, outtype:$a),
3186                  !con(texsamp, (ins intype:$x, intype:$y, intype:$z,
3187                                     intype :$gradx0, intype:$gradx1,
3188                                     intype:$gradx2, intype:$grady0,
3189                                     intype:$grady1, intype:$grady2)),
3190                  inst # " \t\\{$r, $g, $b, $a\\},"
3191                         " [$t, $s, \\{$x, $y, $z, $z\\}],"
3192                         " \\{$gradx0, $gradx1, $gradx2, $gradx2\\},"
3193                         " \\{$grady0, $grady1, $grady2, $grady2\\};",
3194                  []>;
// Emits the four $t/$s operand-kind variants of TEX_3D_GRAD_base
// (R = Int64Regs, I = i64imm; first suffix letter is $t, second is $s).
3196 multiclass TEX_3D_GRAD<string inst, NVPTXRegClass outtype,
3197                        NVPTXRegClass intype> {
3198   def _RR : TEX_3D_GRAD_base<inst, outtype, intype,
3199                              (ins Int64Regs:$t, Int64Regs:$s)>;
3200   def _RI : TEX_3D_GRAD_base<inst, outtype, intype,
3201                              (ins Int64Regs:$t, i64imm:$s)>;
3202   def _IR : TEX_3D_GRAD_base<inst, outtype, intype,
3203                              (ins i64imm:$t, Int64Regs:$s)>;
3204   def _II : TEX_3D_GRAD_base<inst, outtype, intype,
3205                              (ins i64imm:$t, i64imm:$s)>;
// "tex.grad.3d" instantiations: Float32Regs coordinates/gradients with f32,
// s32 or u32 results (s32/u32 both in Int32Regs).
3208 defm TEX_3D_F32_F32_GRAD
3209   : TEX_3D_GRAD<"tex.grad.3d.v4.f32.f32", Float32Regs, Float32Regs>;
3210 defm TEX_3D_S32_F32_GRAD
3211   : TEX_3D_GRAD<"tex.grad.3d.v4.s32.f32", Int32Regs, Float32Regs>;
3212 defm TEX_3D_U32_F32_GRAD
3213   : TEX_3D_GRAD<"tex.grad.3d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for cube fetches.
//   inst    - mnemonic text placed at the start of the asm string.
//   outtype - register class of the four results ($r, $g, $b, $a).
//   intype  - register class of the $x, $y, $z coordinates.
//   texsamp - leading input dag; must define $t and $s (used by the asm
//             string).
// The operand list and asm layout match TEX_3D_base: $z is printed twice to
// give a four-entry coordinate list.
3215 class TEX_CUBE_base<string inst, NVPTXRegClass outtype,
3216                     NVPTXRegClass intype, dag texsamp>
3217     : NVPTXInst<(outs outtype:$r, outtype:$g,
3218                       outtype:$b, outtype:$a),
3219                  !con(texsamp, (ins intype:$x, intype:$y, intype:$z)),
3220                  inst # " \t\\{$r, $g, $b, $a\\},"
3221                         " [$t, $s, \\{$x, $y, $z, $z\\}];",
3222                  []>;
// Emits _RR/_RI/_IR/_II records of TEX_CUBE_base; the suffix gives the
// operand kind of $t then $s (R = Int64Regs, I = i64imm).
3224 multiclass TEX_CUBE<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
3225   def _RR : TEX_CUBE_base<inst, outtype, intype,
3226                           (ins Int64Regs:$t, Int64Regs:$s)>;
3227   def _RI : TEX_CUBE_base<inst, outtype, intype,
3228                           (ins Int64Regs:$t, i64imm:$s)>;
3229   def _IR : TEX_CUBE_base<inst, outtype, intype,
3230                           (ins i64imm:$t, Int64Regs:$s)>;
3231   def _II : TEX_CUBE_base<inst, outtype, intype,
3232                           (ins i64imm:$t, i64imm:$s)>;
// "tex.cube" instantiations: Float32Regs coordinates only, with f32, s32 or
// u32 results (s32/u32 both in Int32Regs).
3235 defm TEX_CUBE_F32_F32
3236   : TEX_CUBE<"tex.cube.v4.f32.f32", Float32Regs, Float32Regs>;
3237 defm TEX_CUBE_S32_F32
3238   : TEX_CUBE<"tex.cube.v4.s32.f32", Int32Regs, Float32Regs>;
3239 defm TEX_CUBE_U32_F32
3240   : TEX_CUBE<"tex.cube.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for cube fetches with an explicit $lod operand.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x/$y/$z/$lod, and `texsamp` the leading input dag that must
// supply $t and $s. $z is printed twice in the coordinate list and $lod is
// appended after the bracket.
3242 class TEX_CUBE_LEVEL_base<string inst, NVPTXRegClass outtype,
3243                           NVPTXRegClass intype, dag texsamp>
3244     : NVPTXInst<(outs outtype:$r, outtype:$g,
3245                       outtype:$b, outtype:$a),
3246                  !con(texsamp, (ins intype:$x, intype:$y, intype:$z,
3247                                     intype:$lod)),
3248                  inst # " \t\\{$r, $g, $b, $a\\},"
3249                         " [$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
3250                  []>;
// Emits the four $t/$s operand-kind variants of TEX_CUBE_LEVEL_base
// (R = Int64Regs, I = i64imm; first suffix letter is $t, second is $s).
3252 multiclass TEX_CUBE_LEVEL<string inst, NVPTXRegClass outtype,
3253                           NVPTXRegClass intype> {
3254   def _RR : TEX_CUBE_LEVEL_base<inst, outtype, intype,
3255                                 (ins Int64Regs:$t, Int64Regs:$s)>;
3256   def _RI : TEX_CUBE_LEVEL_base<inst, outtype, intype,
3257                                 (ins Int64Regs:$t, i64imm:$s)>;
3258   def _IR : TEX_CUBE_LEVEL_base<inst, outtype, intype,
3259                                 (ins i64imm:$t, Int64Regs:$s)>;
3260   def _II : TEX_CUBE_LEVEL_base<inst, outtype, intype,
3261                                 (ins i64imm:$t, i64imm:$s)>;
// "tex.level.cube" instantiations: Float32Regs coordinates with f32, s32 or
// u32 results (s32/u32 both in Int32Regs).
3264 defm TEX_CUBE_F32_F32_LEVEL
3265   : TEX_CUBE_LEVEL<"tex.level.cube.v4.f32.f32", Float32Regs, Float32Regs>;
3266 defm TEX_CUBE_S32_F32_LEVEL
3267   : TEX_CUBE_LEVEL<"tex.level.cube.v4.s32.f32", Int32Regs, Float32Regs>;
3268 defm TEX_CUBE_U32_F32_LEVEL
3269   : TEX_CUBE_LEVEL<"tex.level.cube.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for cube-array fetches.
//   inst    - mnemonic text placed at the start of the asm string.
//   outtype - register class of the four results ($r, $g, $b, $a).
//   intype  - register class of the $x, $y, $z coordinates.
//   texsamp - leading input dag; must define $t and $s (used by the asm
//             string).
// $l is always Int32Regs (presumably the array layer index - confirm).
// With $l present the coordinate list already has four entries, so no
// operand is repeated in the asm string.
3271 class TEX_CUBE_ARRAY_base<string inst, NVPTXRegClass outtype,
3272                           NVPTXRegClass intype, dag texsamp>
3273     : NVPTXInst<(outs outtype:$r, outtype:$g,
3274                       outtype:$b, outtype:$a),
3275                  !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y,
3276                                     intype:$z)),
3277                  inst # " \t\\{$r, $g, $b, $a\\},"
3278                         " [$t, $s, \\{$l, $x, $y, $z\\}];",
3279                  []>;
// Emits _RR/_RI/_IR/_II records of TEX_CUBE_ARRAY_base; the suffix gives the
// operand kind of $t then $s (R = Int64Regs, I = i64imm).
3281 multiclass TEX_CUBE_ARRAY<string inst, NVPTXRegClass outtype,
3282                           NVPTXRegClass intype> {
3283   def _RR : TEX_CUBE_ARRAY_base<inst, outtype, intype,
3284                                 (ins Int64Regs:$t, Int64Regs:$s)>;
3285   def _RI : TEX_CUBE_ARRAY_base<inst, outtype, intype,
3286                                 (ins Int64Regs:$t, i64imm:$s)>;
3287   def _IR : TEX_CUBE_ARRAY_base<inst, outtype, intype,
3288                                 (ins i64imm:$t, Int64Regs:$s)>;
3289   def _II : TEX_CUBE_ARRAY_base<inst, outtype, intype,
3290                                 (ins i64imm:$t, i64imm:$s)>;
// "tex.acube" instantiations: Float32Regs coordinates with f32, s32 or u32
// results (s32/u32 both in Int32Regs).
3293 defm TEX_CUBE_ARRAY_F32_F32
3294   : TEX_CUBE_ARRAY<"tex.acube.v4.f32.f32", Float32Regs, Float32Regs>;
3295 defm TEX_CUBE_ARRAY_S32_F32
3296   : TEX_CUBE_ARRAY<"tex.acube.v4.s32.f32", Int32Regs, Float32Regs>;
3297 defm TEX_CUBE_ARRAY_U32_F32
3298   : TEX_CUBE_ARRAY<"tex.acube.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for cube-array fetches with an explicit $lod.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x/$y/$z/$lod, and `texsamp` the leading input dag that must
// supply $t and $s. $l is always Int32Regs. The coordinate list is
// {$l, $x, $y, $z} with no repeated operand; $lod follows the bracket.
3300 class TEX_CUBE_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
3301                                 NVPTXRegClass intype, dag texsamp>
3302     : NVPTXInst<(outs outtype:$r, outtype:$g,
3303                       outtype:$b, outtype:$a),
3304                  !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y,
3305                                     intype:$z, intype:$lod)),
3306                  inst # " \t\\{$r, $g, $b, $a\\},"
3307                         " [$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
3308                  []>;
// Emits the four $t/$s operand-kind variants of TEX_CUBE_ARRAY_LEVEL_base
// (R = Int64Regs, I = i64imm; first suffix letter is $t, second is $s).
3310 multiclass TEX_CUBE_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
3311                                 NVPTXRegClass intype> {
3312   def _RR : TEX_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
3313                                       (ins Int64Regs:$t, Int64Regs:$s)>;
3314   def _RI : TEX_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
3315                                       (ins Int64Regs:$t, i64imm:$s)>;
3316   def _IR : TEX_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
3317                                       (ins i64imm:$t, Int64Regs:$s)>;
3318   def _II : TEX_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
3319                                       (ins i64imm:$t, i64imm:$s)>;
// "tex.level.acube" instantiations: Float32Regs coordinates with f32, s32 or
// u32 results (s32/u32 both in Int32Regs).
3322 defm TEX_CUBE_ARRAY_F32_F32_LEVEL
3323   : TEX_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.f32.f32",
3324                          Float32Regs, Float32Regs>;
3325 defm TEX_CUBE_ARRAY_S32_F32_LEVEL
3326   : TEX_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.s32.f32",
3327                          Int32Regs, Float32Regs>;
3328 defm TEX_CUBE_ARRAY_U32_F32_LEVEL
3329   : TEX_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.u32.f32",
3330                          Int32Regs, Float32Regs>;
// Base instruction record for the 2D tld4 forms.
//   inst    - mnemonic text placed at the start of the asm string.
//   outtype - register class of the four results, named $v0..$v3 here
//             (not $r/$g/$b/$a as in the TEX_* classes).
//   intype  - register class of the $x, $y coordinates.
//   texsamp - leading input dag; must define $t and $s (used by the asm
//             string).
// The coordinate list is printed as a two-entry brace list.
3332 class TLD4_2D_base<string inst, NVPTXRegClass outtype,
3333                    NVPTXRegClass intype, dag texsamp>
3334     : NVPTXInst<(outs outtype:$v0, outtype:$v1,
3335                       outtype:$v2, outtype:$v3),
3336                  !con(texsamp, (ins intype:$x, intype:$y)),
3337                  inst # " \t\\{$v0, $v1, $v2, $v3\\}, [$t, $s, \\{$x, $y\\}];",
3338                  []>;
// Emits _RR/_RI/_IR/_II records of TLD4_2D_base; the suffix gives the
// operand kind of $t then $s (R = Int64Regs, I = i64imm).
3340 multiclass TLD4_2D<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
3341   def _RR : TLD4_2D_base<inst, outtype, intype,
3342                          (ins Int64Regs:$t, Int64Regs:$s)>;
3343   def _RI : TLD4_2D_base<inst, outtype, intype,
3344                          (ins Int64Regs:$t, i64imm:$s)>;
3345   def _IR : TLD4_2D_base<inst, outtype, intype,
3346                          (ins i64imm:$t, Int64Regs:$s)>;
3347   def _II : TLD4_2D_base<inst, outtype, intype,
3348                          (ins i64imm:$t, i64imm:$s)>;
// tld4 instantiations: one per component selector (r, g, b, a) in the
// mnemonic and per result type (f32, s32, u32). Coordinates are always
// Float32Regs; the s32 and u32 result variants both use Int32Regs.
3351 defm TLD4_R_2D_F32_F32
3352   : TLD4_2D<"tld4.r.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3353 defm TLD4_G_2D_F32_F32
3354   : TLD4_2D<"tld4.g.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3355 defm TLD4_B_2D_F32_F32
3356   : TLD4_2D<"tld4.b.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3357 defm TLD4_A_2D_F32_F32
3358   : TLD4_2D<"tld4.a.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3360 defm TLD4_R_2D_S32_F32
3361   : TLD4_2D<"tld4.r.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3362 defm TLD4_G_2D_S32_F32
3363   : TLD4_2D<"tld4.g.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3364 defm TLD4_B_2D_S32_F32
3365   : TLD4_2D<"tld4.b.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3366 defm TLD4_A_2D_S32_F32
3367   : TLD4_2D<"tld4.a.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3369 defm TLD4_R_2D_U32_F32
3370   : TLD4_2D<"tld4.r.2d.v4.u32.f32", Int32Regs, Float32Regs>;
3371 defm TLD4_G_2D_U32_F32
3372   : TLD4_2D<"tld4.g.2d.v4.u32.f32", Int32Regs, Float32Regs>;
3373 defm TLD4_B_2D_U32_F32
3374   : TLD4_2D<"tld4.b.2d.v4.u32.f32", Int32Regs, Float32Regs>;
3375 defm TLD4_A_2D_U32_F32
3376   : TLD4_2D<"tld4.a.2d.v4.u32.f32", Int32Regs, Float32Regs>;
3381 // texmode_unified
3382 let IsTex = true, IsTexModeUnified = true in {
3383 // Texture fetch instructions using handles
// Base instruction record for unified-mode 1D fetches.
//   inst    - mnemonic text placed at the start of the asm string.
//   outtype - register class of the four results ($r, $g, $b, $a).
//   intype  - register class of the single coordinate $x.
//   tex     - leading input dag; must define $t (the asm string references
//             $t only; there is no $s operand in this form).
3385 class TEX_UNIFIED_1D_base<string inst, NVPTXRegClass outtype,
3386                           NVPTXRegClass intype, dag tex>
3387     : NVPTXInst<(outs outtype:$r, outtype:$g,
3388                       outtype:$b, outtype:$a),
3389                  !con(tex, (ins intype:$x)),
3390                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
3391                  []>;
// Emits two TEX_UNIFIED_1D_base records: _R takes $t as Int64Regs, _I takes
// $t as i64imm.
3393 multiclass TEX_UNIFIED_1D<string inst, NVPTXRegClass outtype,
3394                           NVPTXRegClass intype> {
3395   def _R : TEX_UNIFIED_1D_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3396   def _I : TEX_UNIFIED_1D_base<inst, outtype, intype, (ins i64imm:$t)>;
// Unified-mode "tex.1d" instantiations for each result/coordinate type
// pairing. The S32 and U32 result variants both use Int32Regs and differ
// only in the mnemonic string.
3399 defm TEX_UNIFIED_1D_F32_S32
3400   : TEX_UNIFIED_1D<"tex.1d.v4.f32.s32", Float32Regs, Int32Regs>;
3401 defm TEX_UNIFIED_1D_F32_F32
3402   : TEX_UNIFIED_1D<"tex.1d.v4.f32.f32", Float32Regs, Float32Regs>;
3403 defm TEX_UNIFIED_1D_S32_S32
3404   : TEX_UNIFIED_1D<"tex.1d.v4.s32.s32", Int32Regs, Int32Regs>;
3405 defm TEX_UNIFIED_1D_S32_F32
3406   : TEX_UNIFIED_1D<"tex.1d.v4.s32.f32", Int32Regs, Float32Regs>;
3407 defm TEX_UNIFIED_1D_U32_S32
3408   : TEX_UNIFIED_1D<"tex.1d.v4.u32.s32", Int32Regs, Int32Regs>;
3409 defm TEX_UNIFIED_1D_U32_F32
3410   : TEX_UNIFIED_1D<"tex.1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 1D fetches with an explicit $lod.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x and $lod, and `tex` the leading input dag that must supply $t.
3412 class TEX_UNIFIED_1D_LEVEL_base<string inst, NVPTXRegClass outtype,
3413                                 NVPTXRegClass intype, dag tex>
3414     : NVPTXInst<(outs outtype:$r, outtype:$g,
3415                       outtype:$b, outtype:$a),
3416                  !con(tex, (ins intype:$x, intype:$lod)),
3417                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}], $lod;",
3418                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_1D_LEVEL_base.
3420 multiclass TEX_UNIFIED_1D_LEVEL<string inst, NVPTXRegClass outtype,
3421                                 NVPTXRegClass intype> {
3422   def _R : TEX_UNIFIED_1D_LEVEL_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3423   def _I : TEX_UNIFIED_1D_LEVEL_base<inst, outtype, intype, (ins i64imm:$t)>;
// Unified-mode "tex.level.1d" instantiations: Float32Regs coordinates with
// f32, s32 or u32 results (s32/u32 both in Int32Regs).
3426 defm TEX_UNIFIED_1D_F32_F32_LEVEL
3427   : TEX_UNIFIED_1D_LEVEL<"tex.level.1d.v4.f32.f32", Float32Regs, Float32Regs>;
3428 defm TEX_UNIFIED_1D_S32_F32_LEVEL
3429   : TEX_UNIFIED_1D_LEVEL<"tex.level.1d.v4.s32.f32", Int32Regs, Float32Regs>;
3430 defm TEX_UNIFIED_1D_U32_F32_LEVEL
3431   : TEX_UNIFIED_1D_LEVEL<"tex.level.1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 1D fetches with gradients.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x, $gradx and $grady, and `tex` the leading input dag that must
// supply $t. Each gradient is printed as its own one-entry brace list.
3433 class TEX_UNIFIED_1D_GRAD_base<string inst, NVPTXRegClass outtype,
3434                                NVPTXRegClass intype, dag tex>
3435     : NVPTXInst<(outs outtype:$r, outtype:$g,
3436                       outtype:$b, outtype:$a),
3437                  !con(tex, (ins intype:$x, intype:$gradx, intype:$grady)),
3438                  inst # " \t\\{$r, $g, $b, $a\\},"
3439                         " [$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
3440                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_1D_GRAD_base.
3442 multiclass TEX_UNIFIED_1D_GRAD<string inst, NVPTXRegClass outtype,
3443                                NVPTXRegClass intype> {
3444   def _R : TEX_UNIFIED_1D_GRAD_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3445   def _I : TEX_UNIFIED_1D_GRAD_base<inst, outtype, intype, (ins i64imm:$t)>;
// Unified-mode "tex.grad.1d" instantiations: Float32Regs coordinates with
// f32, s32 or u32 results (s32/u32 both in Int32Regs).
3448 defm TEX_UNIFIED_1D_F32_F32_GRAD
3449   : TEX_UNIFIED_1D_GRAD<"tex.grad.1d.v4.f32.f32", Float32Regs, Float32Regs>;
3450 defm TEX_UNIFIED_1D_S32_F32_GRAD
3451   : TEX_UNIFIED_1D_GRAD<"tex.grad.1d.v4.s32.f32", Int32Regs, Float32Regs>;
3452 defm TEX_UNIFIED_1D_U32_F32_GRAD
3453   : TEX_UNIFIED_1D_GRAD<"tex.grad.1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 1D-array fetches.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x, and `tex` the leading input dag that must supply $t.
// $l is always Int32Regs (presumably the array layer index - confirm) and is
// printed ahead of $x in the coordinate list.
3455 class TEX_UNIFIED_1D_ARRAY_base<string inst, NVPTXRegClass outtype,
3456                                 NVPTXRegClass intype, dag tex>
3457     : NVPTXInst<(outs outtype:$r, outtype:$g,
3458                       outtype:$b, outtype:$a),
3459                  !con(tex, (ins Int32Regs:$l, intype:$x)),
3460                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x\\}];",
3461                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_1D_ARRAY_base.
3463 multiclass TEX_UNIFIED_1D_ARRAY<string inst, NVPTXRegClass outtype,
3464                                 NVPTXRegClass intype> {
3465   def _R : TEX_UNIFIED_1D_ARRAY_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3466   def _I : TEX_UNIFIED_1D_ARRAY_base<inst, outtype, intype, (ins i64imm:$t)>;
// Unified-mode "tex.a1d" instantiations for each result/coordinate type
// pairing. The S32 and U32 result variants both use Int32Regs and differ
// only in the mnemonic string.
3469 defm TEX_UNIFIED_1D_ARRAY_F32_S32
3470   : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.f32.s32", Float32Regs, Int32Regs>;
3471 defm TEX_UNIFIED_1D_ARRAY_F32_F32
3472   : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.f32.f32", Float32Regs, Float32Regs>;
3473 defm TEX_UNIFIED_1D_ARRAY_S32_S32
3474   : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.s32.s32", Int32Regs, Int32Regs>;
3475 defm TEX_UNIFIED_1D_ARRAY_S32_F32
3476   : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.s32.f32", Int32Regs, Float32Regs>;
3477 defm TEX_UNIFIED_1D_ARRAY_U32_S32
3478   : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.u32.s32", Int32Regs, Int32Regs>;
3479 defm TEX_UNIFIED_1D_ARRAY_U32_F32
3480   : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 1D-array fetches with an explicit
// $lod. `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype`
// the class of $x and $lod, and `tex` the leading input dag that must supply
// $t. $l is always Int32Regs.
3482 class TEX_UNIFIED_1D_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
3483                                       NVPTXRegClass intype, dag tex>
3484     : NVPTXInst<(outs outtype:$r, outtype:$g,
3485                       outtype:$b, outtype:$a),
3486                  !con(tex, (ins Int32Regs:$l, intype:$x, intype:$lod)),
3487                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x\\}], $lod;",
3488                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_1D_ARRAY_LEVEL_base.
3490 multiclass TEX_UNIFIED_1D_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
3491                                       NVPTXRegClass intype> {
3492   def _R : TEX_UNIFIED_1D_ARRAY_LEVEL_base<inst, outtype, intype,
3493                                            (ins Int64Regs:$t)>;
3494   def _I : TEX_UNIFIED_1D_ARRAY_LEVEL_base<inst, outtype, intype,
3495                                            (ins i64imm:$t)>;
// Unified-mode "tex.level.a1d" instantiations: Float32Regs coordinates with
// f32, s32 or u32 results (s32/u32 both in Int32Regs).
3498 defm TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
3499   : TEX_UNIFIED_1D_ARRAY_LEVEL<"tex.level.a1d.v4.f32.f32",
3500                                Float32Regs, Float32Regs>;
3501 defm TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
3502   : TEX_UNIFIED_1D_ARRAY_LEVEL<"tex.level.a1d.v4.s32.f32",
3503                                Int32Regs, Float32Regs>;
3504 defm TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
3505   : TEX_UNIFIED_1D_ARRAY_LEVEL<"tex.level.a1d.v4.u32.f32",
3506                                Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 1D-array fetches with gradients.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x, $gradx and $grady, and `tex` the leading input dag that must
// supply $t. $l is always Int32Regs.
// NOTE(review): the second asm-string fragment starts with two spaces
// ("  [$t, ..."), whereas TEX_UNIFIED_1D_GRAD_base uses one. It is emitted
// text, so it is left untouched here.
3508 class TEX_UNIFIED_1D_ARRAY_GRAD_base<string inst, NVPTXRegClass outtype,
3509                                      NVPTXRegClass intype, dag tex>
3510     : NVPTXInst<(outs outtype:$r, outtype:$g,
3511                       outtype:$b, outtype:$a),
3512                  !con(tex, (ins Int32Regs:$l, intype:$x,
3513                                 intype:$gradx, intype:$grady)),
3514                  inst # " \t\\{$r, $g, $b, $a\\},"
3515                         "  [$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
3516                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_1D_ARRAY_GRAD_base.
3518 multiclass TEX_UNIFIED_1D_ARRAY_GRAD<string inst, NVPTXRegClass outtype,
3519                                      NVPTXRegClass intype> {
3520   def _R : TEX_UNIFIED_1D_ARRAY_GRAD_base<inst, outtype, intype,
3521                                           (ins Int64Regs:$t)>;
3522   def _I : TEX_UNIFIED_1D_ARRAY_GRAD_base<inst, outtype, intype,
3523                                           (ins i64imm:$t)>;
// Unified-mode "tex.grad.a1d" instantiations: Float32Regs coordinates with
// f32, s32 or u32 results (s32/u32 both in Int32Regs).
3526 defm TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
3527   : TEX_UNIFIED_1D_ARRAY_GRAD<"tex.grad.a1d.v4.f32.f32",
3528                               Float32Regs, Float32Regs>;
3529 defm TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
3530   : TEX_UNIFIED_1D_ARRAY_GRAD<"tex.grad.a1d.v4.s32.f32",
3531                               Int32Regs, Float32Regs>;
3532 defm TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
3533   : TEX_UNIFIED_1D_ARRAY_GRAD<"tex.grad.a1d.v4.u32.f32",
3534                               Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 2D fetches.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x and $y, and `tex` the leading input dag that must supply $t.
// The coordinate list is printed as a two-entry brace list.
3536 class TEX_UNIFIED_2D_base<string inst, NVPTXRegClass outtype,
3537                           NVPTXRegClass intype, dag tex>
3538     : NVPTXInst<(outs outtype:$r, outtype:$g,
3539                       outtype:$b, outtype:$a),
3540                  !con(tex, (ins intype:$x, intype:$y)),
3541                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y\\}];",
3542                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_2D_base.
3544 multiclass TEX_UNIFIED_2D<string inst, NVPTXRegClass outtype,
3545                           NVPTXRegClass intype> {
3546   def _R : TEX_UNIFIED_2D_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3547   def _I : TEX_UNIFIED_2D_base<inst, outtype, intype, (ins i64imm:$t)>;
// Unified-mode "tex.2d" instantiations for each result/coordinate type
// pairing. The S32 and U32 result variants both use Int32Regs and differ
// only in the mnemonic string.
3550 defm TEX_UNIFIED_2D_F32_S32
3551   : TEX_UNIFIED_2D<"tex.2d.v4.f32.s32", Float32Regs, Int32Regs>;
3552 defm TEX_UNIFIED_2D_F32_F32
3553   : TEX_UNIFIED_2D<"tex.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3554 defm TEX_UNIFIED_2D_S32_S32
3555   : TEX_UNIFIED_2D<"tex.2d.v4.s32.s32", Int32Regs, Int32Regs>;
3556 defm TEX_UNIFIED_2D_S32_F32
3557   : TEX_UNIFIED_2D<"tex.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3558 defm TEX_UNIFIED_2D_U32_S32
3559   : TEX_UNIFIED_2D<"tex.2d.v4.u32.s32", Int32Regs, Int32Regs>;
3560 defm TEX_UNIFIED_2D_U32_F32
3561   : TEX_UNIFIED_2D<"tex.2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 2D fetches with an explicit $lod.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x, $y and $lod, and `tex` the leading input dag that must
// supply $t.
3563 class TEX_UNIFIED_2D_LEVEL_base<string inst, NVPTXRegClass outtype,
3564                                 NVPTXRegClass intype, dag tex>
3565     : NVPTXInst<(outs outtype:$r, outtype:$g,
3566                       outtype:$b, outtype:$a),
3567                  !con(tex, (ins intype:$x, intype:$y, intype:$lod)),
3568                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y\\}], $lod;",
3569                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_2D_LEVEL_base.
3571 multiclass TEX_UNIFIED_2D_LEVEL<string inst, NVPTXRegClass outtype,
3572                                 NVPTXRegClass intype> {
3573   def _R : TEX_UNIFIED_2D_LEVEL_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3574   def _I : TEX_UNIFIED_2D_LEVEL_base<inst, outtype, intype, (ins i64imm:$t)>;
// Unified-mode "tex.level.2d" instantiations: Float32Regs coordinates with
// f32, s32 or u32 results (s32/u32 both in Int32Regs).
3577 defm TEX_UNIFIED_2D_F32_F32_LEVEL
3578   : TEX_UNIFIED_2D_LEVEL<"tex.level.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3579 defm TEX_UNIFIED_2D_S32_F32_LEVEL
3580   : TEX_UNIFIED_2D_LEVEL<"tex.level.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3581 defm TEX_UNIFIED_2D_U32_F32_LEVEL
3582   : TEX_UNIFIED_2D_LEVEL<"tex.level.2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 2D fetches with gradients.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x, $y and the four gradient operands, and `tex` the leading
// input dag that must supply $t. The gradients are printed as two
// two-entry brace lists: {$gradx0, $gradx1} and {$grady0, $grady1}.
3584 class TEX_UNIFIED_2D_GRAD_base<string inst, NVPTXRegClass outtype,
3585                                NVPTXRegClass intype, dag tex>
3586     : NVPTXInst<(outs outtype:$r, outtype:$g,
3587                       outtype:$b, outtype:$a),
3588                  !con(tex, (ins intype:$x, intype:$y,
3589                                 intype:$gradx0, intype:$gradx1,
3590                                 intype:$grady0, intype:$grady1)),
3591                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y\\}],"
3592                         " \\{$gradx0, $gradx1\\}, \\{$grady0, $grady1\\};",
3593                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_2D_GRAD_base.
3594 multiclass TEX_UNIFIED_2D_GRAD<string inst, NVPTXRegClass outtype,
3595                                NVPTXRegClass intype> {
3596   def _R : TEX_UNIFIED_2D_GRAD_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3597   def _I : TEX_UNIFIED_2D_GRAD_base<inst, outtype, intype, (ins i64imm:$t)>;
// Unified-mode "tex.grad.2d" instantiations: Float32Regs coordinates with
// f32, s32 or u32 results (s32/u32 both in Int32Regs).
3600 defm TEX_UNIFIED_2D_F32_F32_GRAD
3601   : TEX_UNIFIED_2D_GRAD<"tex.grad.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3602 defm TEX_UNIFIED_2D_S32_F32_GRAD
3603   : TEX_UNIFIED_2D_GRAD<"tex.grad.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3604 defm TEX_UNIFIED_2D_U32_F32_GRAD
3605   : TEX_UNIFIED_2D_GRAD<"tex.grad.2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 2D-array fetches.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x and $y, and `tex` the leading input dag that must supply $t.
// $l is always Int32Regs. The asm string prints $y twice so the coordinate
// list {$l, $x, $y, $y} has four entries.
3607 class TEX_UNIFIED_2D_ARRAY_base<string inst, NVPTXRegClass outtype,
3608                                 NVPTXRegClass intype, dag tex>
3609     : NVPTXInst<(outs outtype:$r, outtype:$g,
3610                       outtype:$b, outtype:$a),
3611                  !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y)),
3612                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x, $y, $y\\}];",
3613                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_2D_ARRAY_base.
3614 multiclass TEX_UNIFIED_2D_ARRAY<string inst, NVPTXRegClass outtype,
3615                                 NVPTXRegClass intype> {
3616   def _R : TEX_UNIFIED_2D_ARRAY_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3617   def _I : TEX_UNIFIED_2D_ARRAY_base<inst, outtype, intype, (ins i64imm:$t)>;
// Unified-mode "tex.a2d" instantiations for each result/coordinate type
// pairing. The S32 and U32 result variants both use Int32Regs and differ
// only in the mnemonic string.
3620 defm TEX_UNIFIED_2D_ARRAY_F32_S32
3621   : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.f32.s32", Float32Regs, Int32Regs>;
3622 defm TEX_UNIFIED_2D_ARRAY_F32_F32
3623   : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.f32.f32", Float32Regs, Float32Regs>;
3624 defm TEX_UNIFIED_2D_ARRAY_S32_S32
3625   : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.s32.s32", Int32Regs, Int32Regs>;
3626 defm TEX_UNIFIED_2D_ARRAY_S32_F32
3627   : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.s32.f32", Int32Regs, Float32Regs>;
3628 defm TEX_UNIFIED_2D_ARRAY_U32_S32
3629   : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.u32.s32", Int32Regs, Int32Regs>;
3630 defm TEX_UNIFIED_2D_ARRAY_U32_F32
3631   : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 2D-array fetches with an explicit
// $lod. `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype`
// the class of $x, $y and $lod, and `tex` the leading input dag that must
// supply $t. $l is always Int32Regs; $y is printed twice in the coordinate
// list.
// NOTE(review): the second asm-string fragment starts with two spaces
// ("  [$t, ..."), unlike the single space used by TEX_UNIFIED_3D_LEVEL_base.
// It is emitted text, so it is left untouched here.
3633 class TEX_UNIFIED_2D_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
3634                                       NVPTXRegClass intype, dag tex>
3635     : NVPTXInst<(outs outtype:$r, outtype:$g,
3636                       outtype:$b, outtype:$a),
3637                  !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y,
3638                                 intype:$lod)),
3639                  inst # " \t\\{$r, $g, $b, $a\\},"
3640                         "  [$t, \\{$l, $x, $y, $y\\}], $lod;",
3641                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_2D_ARRAY_LEVEL_base.
3642 multiclass TEX_UNIFIED_2D_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
3643                                       NVPTXRegClass intype> {
3644   def _R : TEX_UNIFIED_2D_ARRAY_LEVEL_base<inst, outtype, intype,
3645                                            (ins Int64Regs:$t)>;
3646   def _I : TEX_UNIFIED_2D_ARRAY_LEVEL_base<inst, outtype, intype,
3647                                            (ins i64imm:$t)>;
// Unified-mode "tex.level.a2d" instantiations: Float32Regs coordinates with
// f32, s32 or u32 results (s32/u32 both in Int32Regs).
3650 defm TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
3651   : TEX_UNIFIED_2D_ARRAY_LEVEL<"tex.level.a2d.v4.f32.f32",
3652                                Float32Regs, Float32Regs>;
3653 defm TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
3654   : TEX_UNIFIED_2D_ARRAY_LEVEL<"tex.level.a2d.v4.s32.f32",
3655                                Int32Regs, Float32Regs>;
3656 defm TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
3657   : TEX_UNIFIED_2D_ARRAY_LEVEL<"tex.level.a2d.v4.u32.f32",
3658                                Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 2D-array fetches with gradients.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x, $y and the four gradient operands, and `tex` the leading
// input dag that must supply $t. $l is always Int32Regs; $y is printed twice
// in the coordinate list and the gradients form two two-entry brace lists.
3660 class TEX_UNIFIED_2D_ARRAY_GRAD_base<string inst, NVPTXRegClass outtype,
3661                                      NVPTXRegClass intype, dag tex>
3662     : NVPTXInst<(outs outtype:$r, outtype:$g,
3663                       outtype:$b, outtype:$a),
3664                  !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y,
3665                                 intype:$gradx0, intype:$gradx1,
3666                                 intype:$grady0, intype:$grady1)),
3667                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x, $y, $y\\}],"
3668                         " \\{$gradx0, $gradx1\\}, \\{$grady0, $grady1\\};",
3669                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_2D_ARRAY_GRAD_base.
3670 multiclass TEX_UNIFIED_2D_ARRAY_GRAD<string inst, NVPTXRegClass outtype,
3671                                      NVPTXRegClass intype> {
3672   def _R : TEX_UNIFIED_2D_ARRAY_GRAD_base<inst, outtype, intype,
3673                                           (ins Int64Regs:$t)>;
3674   def _I : TEX_UNIFIED_2D_ARRAY_GRAD_base<inst, outtype, intype,
3675                                           (ins i64imm:$t)>;
// Unified-mode "tex.grad.a2d" instantiations: Float32Regs coordinates with
// f32, s32 or u32 results (s32/u32 both in Int32Regs).
3678 defm TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
3679   : TEX_UNIFIED_2D_ARRAY_GRAD<"tex.grad.a2d.v4.f32.f32",
3680                               Float32Regs, Float32Regs>;
3681 defm TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
3682   : TEX_UNIFIED_2D_ARRAY_GRAD<"tex.grad.a2d.v4.s32.f32",
3683                               Int32Regs, Float32Regs>;
3684 defm TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
3685   : TEX_UNIFIED_2D_ARRAY_GRAD<"tex.grad.a2d.v4.u32.f32",
3686                               Int32Regs, Float32Regs>;
// Base instruction record for unified-mode 3D fetches.
// `inst` is the mnemonic, `outtype` the class of $r/$g/$b/$a, `intype` the
// class of $x, $y and $z, and `tex` the leading input dag that must supply
// $t. The asm string prints $z twice so the coordinate list has four
// entries.
3688 class TEX_UNIFIED_3D_base<string inst, NVPTXRegClass outtype,
3689                           NVPTXRegClass intype, dag tex>
3690     : NVPTXInst<(outs outtype:$r, outtype:$g,
3691                       outtype:$b, outtype:$a),
3692                  !con(tex, (ins intype:$x, intype:$y, intype:$z)),
3693                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y, $z, $z\\}];",
3694                  []>;
// Emits the _R ($t in Int64Regs) and _I ($t as i64imm) variants of
// TEX_UNIFIED_3D_base.
3695 multiclass TEX_UNIFIED_3D<string inst, NVPTXRegClass outtype,
3696                           NVPTXRegClass intype> {
3697   def _R : TEX_UNIFIED_3D_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3698   def _I : TEX_UNIFIED_3D_base<inst, outtype, intype, (ins i64imm:$t)>;
// Unified-mode "tex.3d" instantiations for each result/coordinate type
// pairing. The S32 and U32 result variants both use Int32Regs and differ
// only in the mnemonic string.
3701 defm TEX_UNIFIED_3D_F32_S32
3702   : TEX_UNIFIED_3D<"tex.3d.v4.f32.s32", Float32Regs, Int32Regs>;
3703 defm TEX_UNIFIED_3D_F32_F32
3704   : TEX_UNIFIED_3D<"tex.3d.v4.f32.f32", Float32Regs, Float32Regs>;
3705 defm TEX_UNIFIED_3D_S32_S32
3706   : TEX_UNIFIED_3D<"tex.3d.v4.s32.s32", Int32Regs, Int32Regs>;
3707 defm TEX_UNIFIED_3D_S32_F32
3708   : TEX_UNIFIED_3D<"tex.3d.v4.s32.f32", Int32Regs, Float32Regs>;
3709 defm TEX_UNIFIED_3D_U32_S32
3710   : TEX_UNIFIED_3D<"tex.3d.v4.u32.s32", Int32Regs, Int32Regs>;
3711 defm TEX_UNIFIED_3D_U32_F32
3712   : TEX_UNIFIED_3D<"tex.3d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for 3D texture fetches that carry an extra $lod operand.
// Same operand layout as the plain 3D form (four outtype results, handle $t
// from `tex`, coordinates $x/$y/$z with $z printed twice), plus one more
// intype input, $lod, printed after the bracketed address.
3714 class TEX_UNIFIED_3D_LEVEL_base<string inst, NVPTXRegClass outtype,
3715                                 NVPTXRegClass intype, dag tex>
3716     : NVPTXInst<(outs outtype:$r, outtype:$g,
3717                       outtype:$b, outtype:$a),
3718                  !con(tex, (ins intype:$x, intype:$y, intype:$z, intype:$lod)),
3719                  inst # " \t\\{$r, $g, $b, $a\\},"
3720                         " [$t, \\{$x, $y, $z, $z\\}], $lod;",
3721                  []>;
// _R / _I variants of TEX_UNIFIED_3D_LEVEL_base: handle $t as an Int64Regs
// register or as an i64imm immediate, respectively.
3722 multiclass TEX_UNIFIED_3D_LEVEL<string inst, NVPTXRegClass outtype,
3723                                 NVPTXRegClass intype> {
3724   def _R : TEX_UNIFIED_3D_LEVEL_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3725   def _I : TEX_UNIFIED_3D_LEVEL_base<inst, outtype, intype, (ins i64imm:$t)>;
// "tex.level.3d" instantiations: f32/s32/u32 results, f32 coordinates only.
3728 defm TEX_UNIFIED_3D_F32_F32_LEVEL
3729   : TEX_UNIFIED_3D_LEVEL<"tex.level.3d.v4.f32.f32", Float32Regs, Float32Regs>;
3730 defm TEX_UNIFIED_3D_S32_F32_LEVEL
3731   : TEX_UNIFIED_3D_LEVEL<"tex.level.3d.v4.s32.f32", Int32Regs, Float32Regs>;
3732 defm TEX_UNIFIED_3D_U32_F32_LEVEL
3733   : TEX_UNIFIED_3D_LEVEL<"tex.level.3d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for 3D texture fetches that take explicit gradient operands.
// Inputs after the handle dag `tex`: coordinates $x/$y/$z, then
// $gradx0..$gradx2 and $grady0..$grady2, all of class intype.
// The asm string prints three four-element vectors; in each one the third
// operand ($z, $gradx2, $grady2) is repeated to fill the fourth slot.
3735 class TEX_UNIFIED_3D_GRAD_base<string inst, NVPTXRegClass outtype,
3736                                NVPTXRegClass intype, dag tex>
3737     : NVPTXInst<(outs outtype:$r, outtype:$g,
3738                       outtype:$b, outtype:$a),
3739                  !con(tex, (ins intype:$x, intype:$y, intype:$z,
3740                                 intype:$gradx0, intype:$gradx1,
3741                                 intype:$gradx2, intype:$grady0,
3742                                 intype:$grady1, intype:$grady2)),
3743                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y, $z, $z\\}],"
3744                         " \\{$gradx0, $gradx1, $gradx2, $gradx2\\},"
3745                         " \\{$grady0, $grady1, $grady2, $grady2\\};",
3746                  []>;
// _R / _I variants of TEX_UNIFIED_3D_GRAD_base: handle $t as an Int64Regs
// register or as an i64imm immediate, respectively.
3747 multiclass TEX_UNIFIED_3D_GRAD<string inst, NVPTXRegClass outtype,
3748                                NVPTXRegClass intype> {
3749   def _R : TEX_UNIFIED_3D_GRAD_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3750   def _I : TEX_UNIFIED_3D_GRAD_base<inst, outtype, intype, (ins i64imm:$t)>;
// "tex.grad.3d" instantiations: f32/s32/u32 results, f32 coordinates only.
3753 defm TEX_UNIFIED_3D_F32_F32_GRAD
3754   : TEX_UNIFIED_3D_GRAD<"tex.grad.3d.v4.f32.f32", Float32Regs, Float32Regs>;
3755 defm TEX_UNIFIED_3D_S32_F32_GRAD
3756   : TEX_UNIFIED_3D_GRAD<"tex.grad.3d.v4.s32.f32", Int32Regs, Float32Regs>;
3757 defm TEX_UNIFIED_3D_U32_F32_GRAD
3758   : TEX_UNIFIED_3D_GRAD<"tex.grad.3d.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for the cube texture fetches of the TEX_UNIFIED family.
// Operand layout matches the 3D form: four outtype results, handle $t from
// `tex`, and intype coordinates $x/$y/$z with $z printed twice in the
// four-element coordinate vector.
3760 class TEX_UNIFIED_CUBE_base<string inst, NVPTXRegClass outtype,
3761                             NVPTXRegClass intype, dag tex>
3762     : NVPTXInst<(outs outtype:$r, outtype:$g,
3763                       outtype:$b, outtype:$a),
3764                  !con(tex, (ins intype:$x, intype:$y, intype:$z)),
3765                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y, $z, $z\\}];",
3766                  []>;
// _R / _I variants of TEX_UNIFIED_CUBE_base: handle $t as an Int64Regs
// register or as an i64imm immediate, respectively.
3767 multiclass TEX_UNIFIED_CUBE<string inst, NVPTXRegClass outtype,
3768                             NVPTXRegClass intype> {
3769   def _R : TEX_UNIFIED_CUBE_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3770   def _I : TEX_UNIFIED_CUBE_base<inst, outtype, intype, (ins i64imm:$t)>;
// "tex.cube" instantiations: f32/s32/u32 results, f32 coordinates only.
3773 defm TEX_UNIFIED_CUBE_F32_F32
3774   : TEX_UNIFIED_CUBE<"tex.cube.v4.f32.f32", Float32Regs, Float32Regs>;
3775 defm TEX_UNIFIED_CUBE_S32_F32
3776   : TEX_UNIFIED_CUBE<"tex.cube.v4.s32.f32", Int32Regs, Float32Regs>;
3777 defm TEX_UNIFIED_CUBE_U32_F32
3778   : TEX_UNIFIED_CUBE<"tex.cube.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for cube texture fetches that carry an extra $lod operand.
// Adds one intype input, $lod, to the plain cube layout; it is printed after
// the bracketed address. $z is printed twice in the coordinate vector.
3780 class TEX_UNIFIED_CUBE_LEVEL_base<string inst, NVPTXRegClass outtype,
3781                                   NVPTXRegClass intype, dag tex>
3782     : NVPTXInst<(outs outtype:$r, outtype:$g,
3783                       outtype:$b, outtype:$a),
3784                  !con(tex, (ins intype:$x, intype:$y, intype:$z, intype:$lod)),
3785                  inst # " \t\\{$r, $g, $b, $a\\},"
3786                         " [$t, \\{$x, $y, $z, $z\\}], $lod;",
3787                  []>;
// _R / _I variants of TEX_UNIFIED_CUBE_LEVEL_base: handle $t as an Int64Regs
// register or as an i64imm immediate, respectively.
3788 multiclass TEX_UNIFIED_CUBE_LEVEL<string inst, NVPTXRegClass outtype,
3789                                   NVPTXRegClass intype> {
3790   def _R : TEX_UNIFIED_CUBE_LEVEL_base<inst, outtype, intype,
3791                                        (ins Int64Regs:$t)>;
3792   def _I : TEX_UNIFIED_CUBE_LEVEL_base<inst, outtype, intype,
3793                                        (ins i64imm:$t)>;
// "tex.level.cube" instantiations: f32/s32/u32 results, f32 coordinates only.
3796 defm TEX_UNIFIED_CUBE_F32_F32_LEVEL
3797   : TEX_UNIFIED_CUBE_LEVEL<"tex.level.cube.v4.f32.f32",
3798                            Float32Regs, Float32Regs>;
3799 defm TEX_UNIFIED_CUBE_S32_F32_LEVEL
3800   : TEX_UNIFIED_CUBE_LEVEL<"tex.level.cube.v4.s32.f32",
3801                            Int32Regs, Float32Regs>;
3802 defm TEX_UNIFIED_CUBE_U32_F32_LEVEL
3803   : TEX_UNIFIED_CUBE_LEVEL<"tex.level.cube.v4.u32.f32",
3804                            Int32Regs, Float32Regs>;
// Base record for cube-array texture fetches ("acube" mnemonics).
// Inputs after the handle dag `tex`: $l, always Int32Regs, followed by the
// intype coordinates $x/$y/$z. The coordinate vector is printed as
// {$l, $x, $y, $z}, so no operand is repeated here.
3806 class TEX_UNIFIED_CUBE_ARRAY_base<string inst, NVPTXRegClass outtype,
3807                                   NVPTXRegClass intype, dag tex>
3808     : NVPTXInst<(outs outtype:$r, outtype:$g,
3809                       outtype:$b, outtype:$a),
3810                  !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y, intype:$z)),
3811                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x, $y, $z\\}];",
3812                  []>;
// _R / _I variants of TEX_UNIFIED_CUBE_ARRAY_base: handle $t as an Int64Regs
// register or as an i64imm immediate, respectively.
3813 multiclass TEX_UNIFIED_CUBE_ARRAY<string inst, NVPTXRegClass outtype,
3814                                   NVPTXRegClass intype> {
3815   def _R : TEX_UNIFIED_CUBE_ARRAY_base<inst, outtype, intype,
3816                                        (ins Int64Regs:$t)>;
3817   def _I : TEX_UNIFIED_CUBE_ARRAY_base<inst, outtype, intype,
3818                                        (ins i64imm:$t)>;
// "tex.acube" instantiations: f32/s32/u32 results, f32 coordinates only.
3821 defm TEX_UNIFIED_CUBE_ARRAY_F32_F32
3822   : TEX_UNIFIED_CUBE_ARRAY<"tex.acube.v4.f32.f32", Float32Regs, Float32Regs>;
3823 defm TEX_UNIFIED_CUBE_ARRAY_S32_F32
3824   : TEX_UNIFIED_CUBE_ARRAY<"tex.acube.v4.s32.f32", Int32Regs, Float32Regs>;
3825 defm TEX_UNIFIED_CUBE_ARRAY_U32_F32
3826   : TEX_UNIFIED_CUBE_ARRAY<"tex.acube.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for cube-array texture fetches that carry an extra $lod operand.
// Inputs after the handle dag `tex`: Int32Regs $l, intype $x/$y/$z, and
// intype $lod. The address vector is {$l, $x, $y, $z}; $lod is printed after
// the bracketed address.
3828 class TEX_UNIFIED_CUBE_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
3829                                         NVPTXRegClass intype, dag tex>
3830     : NVPTXInst<(outs outtype:$r, outtype:$g,
3831                       outtype:$b, outtype:$a),
3832                  !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y, intype:$z,
3833                                 intype:$lod)),
3834                  inst # " \t\\{$r, $g, $b, $a\\},"
3835                         " [$t, \\{$l, $x, $y, $z\\}], $lod;",
3836                  []>;
// _R / _I variants of TEX_UNIFIED_CUBE_ARRAY_LEVEL_base: handle $t as an
// Int64Regs register or as an i64imm immediate, respectively.
3837 multiclass TEX_UNIFIED_CUBE_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
3838                                         NVPTXRegClass intype> {
3839   def _R : TEX_UNIFIED_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
3840                                              (ins Int64Regs:$t)>;
3841   def _I : TEX_UNIFIED_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
3842                                              (ins i64imm:$t)>;
// "tex.level.acube" instantiations: f32/s32/u32 results, f32 coordinates only.
3845 defm TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
3846   : TEX_UNIFIED_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.f32.f32",
3847                                  Float32Regs, Float32Regs>;
3848 defm TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
3849   : TEX_UNIFIED_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.s32.f32",
3850                                  Int32Regs, Float32Regs>;
3851 defm TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
3852   : TEX_UNIFIED_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.u32.f32",
3853                                  Int32Regs, Float32Regs>;
// Base record for cube texture fetches that take explicit gradient operands.
// Inputs after the handle dag `tex`: coordinates $x/$y/$z, then
// $gradx0..$gradx2 and $grady0..$grady2, all of class intype.
// Each of the three printed vectors repeats its third operand ($z, $gradx2,
// $grady2) to fill the fourth slot.
3855 class TEX_UNIFIED_CUBE_GRAD_base<string inst, NVPTXRegClass outtype,
3856                                  NVPTXRegClass intype, dag tex>
3857     : NVPTXInst<(outs outtype:$r, outtype:$g,
3858                       outtype:$b, outtype:$a),
3859                  !con(tex, (ins intype:$x, intype:$y, intype:$z,
3860                                 intype:$gradx0, intype:$gradx1,
3861                                 intype:$gradx2, intype:$grady0,
3862                                 intype:$grady1, intype:$grady2)),
3863                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y, $z, $z\\}],"
3864                         " \\{$gradx0, $gradx1, $gradx2, $gradx2\\},"
3865                         " \\{$grady0, $grady1, $grady2, $grady2\\};",
3866                  []>;
// _R / _I variants of TEX_UNIFIED_CUBE_GRAD_base: handle $t as an Int64Regs
// register or as an i64imm immediate, respectively.
3868 multiclass TEX_UNIFIED_CUBE_GRAD<string inst, NVPTXRegClass outtype,
3869                                  NVPTXRegClass intype> {
3870   def _R : TEX_UNIFIED_CUBE_GRAD_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3871   def _I : TEX_UNIFIED_CUBE_GRAD_base<inst, outtype, intype, (ins i64imm:$t)>;
// "tex.grad.cube" instantiations: f32/s32/u32 results, f32 coordinates only.
3874 defm TEX_UNIFIED_CUBE_F32_F32_GRAD
3875   : TEX_UNIFIED_CUBE_GRAD<"tex.grad.cube.v4.f32.f32", Float32Regs, Float32Regs>;
3876 defm TEX_UNIFIED_CUBE_S32_F32_GRAD
3877   : TEX_UNIFIED_CUBE_GRAD<"tex.grad.cube.v4.s32.f32", Int32Regs, Float32Regs>;
3878 defm TEX_UNIFIED_CUBE_U32_F32_GRAD
3879   : TEX_UNIFIED_CUBE_GRAD<"tex.grad.cube.v4.u32.f32", Int32Regs, Float32Regs>;
// Base record for cube-array texture fetches with explicit gradient operands.
// Inputs after the handle dag `tex`: Int32Regs $l, intype $x/$y/$z, then
// intype $gradx0..$gradx2 and $grady0..$grady2.
// The address vector is {$l, $x, $y, $z} (nothing repeated); the two gradient
// vectors repeat $gradx2 / $grady2 to fill their fourth slot.
3881 class TEX_UNIFIED_CUBE_ARRAY_GRAD_base<string inst, NVPTXRegClass outtype,
3882                                        NVPTXRegClass intype, dag tex>
3883     : NVPTXInst<(outs outtype:$r, outtype:$g,
3884                       outtype:$b, outtype:$a),
3885                  !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y, intype:$z,
3886                                 intype:$gradx0, intype:$gradx1,
3887                                 intype:$gradx2, intype:$grady0,
3888                                 intype:$grady1, intype:$grady2)),
3889                  inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x, $y, $z\\}],"
3890                         " \\{$gradx0, $gradx1, $gradx2, $gradx2\\},"
3891                         " \\{$grady0, $grady1, $grady2, $grady2\\};",
3892                  []>;
// _R / _I variants of TEX_UNIFIED_CUBE_ARRAY_GRAD_base: handle $t as an
// Int64Regs register or as an i64imm immediate, respectively.
3893 multiclass TEX_UNIFIED_CUBE_ARRAY_GRAD<string inst, NVPTXRegClass outtype,
3894                                        NVPTXRegClass intype> {
3895   def _R : TEX_UNIFIED_CUBE_ARRAY_GRAD_base<inst, outtype, intype,
3896                                             (ins Int64Regs:$t)>;
3897   def _I : TEX_UNIFIED_CUBE_ARRAY_GRAD_base<inst, outtype, intype,
3898                                             (ins i64imm:$t)>;
// "tex.grad.acube" instantiations: f32/s32/u32 results, f32 coordinates only.
3901 defm TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD
3902   : TEX_UNIFIED_CUBE_ARRAY_GRAD<"tex.grad.acube.v4.f32.f32",
3903                                 Float32Regs, Float32Regs>;
3904 defm TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD
3905   : TEX_UNIFIED_CUBE_ARRAY_GRAD<"tex.grad.acube.v4.s32.f32",
3906                                 Int32Regs, Float32Regs>;
3907 defm TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD
3908   : TEX_UNIFIED_CUBE_ARRAY_GRAD<"tex.grad.acube.v4.u32.f32",
3909                                 Int32Regs, Float32Regs>;
// Base record for the 2D tld4 instructions of the unified family.
//   inst    - mnemonic text placed at the front of the asm string.
//   outtype - register class of the four results $v0..$v3.
//   intype  - register class of the two coordinates $x, $y.
//   tex     - input dag prepended via !con; expected to name the $t operand.
// Unlike the tex forms, the coordinate vector has two elements and nothing is
// repeated.
3911 class TLD4_UNIFIED_2D_base<string inst, NVPTXRegClass outtype,
3912                            NVPTXRegClass intype, dag tex>
3913     : NVPTXInst<(outs outtype:$v0, outtype:$v1,
3914                       outtype:$v2, outtype:$v3),
3915                  !con(tex, (ins intype:$x, intype:$y)),
3916                  inst # " \t\\{$v0, $v1, $v2, $v3\\}, [$t, \\{$x, $y\\}];",
3917                  []>;
// _R / _I variants of TLD4_UNIFIED_2D_base: handle $t as an Int64Regs
// register or as an i64imm immediate, respectively.
3918 multiclass TLD4_UNIFIED_2D<string inst, NVPTXRegClass outtype,
3919                            NVPTXRegClass intype> {
3920   def _R : TLD4_UNIFIED_2D_base<inst, outtype, intype, (ins Int64Regs:$t)>;
3921   def _I : TLD4_UNIFIED_2D_base<inst, outtype, intype, (ins i64imm:$t)>;
// One instantiation per mnemonic selector (.r, .g, .b, .a) and result type.
// f32 results, f32 coordinates:
3924 defm TLD4_UNIFIED_R_2D_F32_F32
3925   : TLD4_UNIFIED_2D<"tld4.r.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3926 defm TLD4_UNIFIED_G_2D_F32_F32
3927   : TLD4_UNIFIED_2D<"tld4.g.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3928 defm TLD4_UNIFIED_B_2D_F32_F32
3929   : TLD4_UNIFIED_2D<"tld4.b.2d.v4.f32.f32", Float32Regs, Float32Regs>;
3930 defm TLD4_UNIFIED_A_2D_F32_F32
3931   : TLD4_UNIFIED_2D<"tld4.a.2d.v4.f32.f32", Float32Regs, Float32Regs>;
// s32 results (Int32Regs), f32 coordinates:
3933 defm TLD4_UNIFIED_R_2D_S32_F32
3934   : TLD4_UNIFIED_2D<"tld4.r.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3935 defm TLD4_UNIFIED_G_2D_S32_F32
3936   : TLD4_UNIFIED_2D<"tld4.g.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3937 defm TLD4_UNIFIED_B_2D_S32_F32
3938   : TLD4_UNIFIED_2D<"tld4.b.2d.v4.s32.f32", Int32Regs, Float32Regs>;
3939 defm TLD4_UNIFIED_A_2D_S32_F32
3940   : TLD4_UNIFIED_2D<"tld4.a.2d.v4.s32.f32", Int32Regs, Float32Regs>;
// u32 results (also Int32Regs), f32 coordinates:
3942 defm TLD4_UNIFIED_R_2D_U32_F32
3943   : TLD4_UNIFIED_2D<"tld4.r.2d.v4.u32.f32", Int32Regs, Float32Regs>;
3944 defm TLD4_UNIFIED_G_2D_U32_F32
3945   : TLD4_UNIFIED_2D<"tld4.g.2d.v4.u32.f32", Int32Regs, Float32Regs>;
3946 defm TLD4_UNIFIED_B_2D_U32_F32
3947   : TLD4_UNIFIED_2D<"tld4.b.2d.v4.u32.f32", Int32Regs, Float32Regs>;
3948 defm TLD4_UNIFIED_A_2D_U32_F32
3949   : TLD4_UNIFIED_2D<"tld4.a.2d.v4.u32.f32", Int32Regs, Float32Regs>;
3955 //=== Surface load instructions
// Records defined inside this scope get IsSuld = true. The definitions that
// follow are the surface loads with a single result ($r).
// NOTE(review): the later scopes for the two- and four-result loads use the
// integers 2 and 3, so `true` here presumably stands for 1 - confirm against
// the declaration of IsSuld.
3957 let IsSuld = true in {
// Base record for single-result 1D surface loads.
//   inst    - mnemonic text placed at the front of the asm string.
//   outtype - register class of the result $r.
//   surf    - input dag prepended via !con; expected to name the $s operand.
// The single coordinate $x is always Int32Regs.
3959 class SULD_1D_base<string inst, NVPTXRegClass outtype, dag surf>
3960     : NVPTXInst<(outs outtype:$r),
3961                 !con(surf, (ins Int32Regs:$x)),
3962                 inst # " \\{$r\\}, [$s, \\{$x\\}];",
3963                 []>;
// Two variants per instantiation of SULD_1D_base:
//   _R - the handle $s is an Int64Regs register operand.
//   _I - the handle $s is an i64imm immediate operand.
3964 multiclass SULD_1D<string inst, NVPTXRegClass outtype> {
3965   def _R : SULD_1D_base<inst, outtype, (ins Int64Regs:$s)>;
3966   def _I : SULD_1D_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.1d" instantiations for b8/b16/b32/b64 with the .clamp, .trap and
// .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the result.
3969 defm SULD_1D_I8_CLAMP : SULD_1D<"suld.b.1d.b8.clamp", Int16Regs>;
3970 defm SULD_1D_I16_CLAMP : SULD_1D<"suld.b.1d.b16.clamp", Int16Regs>;
3971 defm SULD_1D_I32_CLAMP : SULD_1D<"suld.b.1d.b32.clamp", Int32Regs>;
3972 defm SULD_1D_I64_CLAMP : SULD_1D<"suld.b.1d.b64.clamp", Int64Regs>;
3974 defm SULD_1D_I8_TRAP : SULD_1D<"suld.b.1d.b8.trap", Int16Regs>;
3975 defm SULD_1D_I16_TRAP : SULD_1D<"suld.b.1d.b16.trap", Int16Regs>;
3976 defm SULD_1D_I32_TRAP : SULD_1D<"suld.b.1d.b32.trap", Int32Regs>;
3977 defm SULD_1D_I64_TRAP : SULD_1D<"suld.b.1d.b64.trap", Int64Regs>;
3979 defm SULD_1D_I8_ZERO : SULD_1D<"suld.b.1d.b8.zero", Int16Regs>;
3980 defm SULD_1D_I16_ZERO : SULD_1D<"suld.b.1d.b16.zero", Int16Regs>;
3981 defm SULD_1D_I32_ZERO : SULD_1D<"suld.b.1d.b32.zero", Int32Regs>;
3982 defm SULD_1D_I64_ZERO : SULD_1D<"suld.b.1d.b64.zero", Int64Regs>;
// Base record for single-result 1D-array surface loads ("a1d" mnemonics).
// Inputs after the handle dag `surf`: $l and $x, both Int32Regs, printed as
// the two-element vector {$l, $x}. outtype is the class of the result $r.
3984 class SULD_1D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf>
3985     : NVPTXInst<(outs outtype:$r),
3986                 !con(surf, (ins Int32Regs:$l, Int32Regs:$x)),
3987                 inst # " \\{$r\\}, [$s, \\{$l, $x\\}];",
3988                 []>;
// _R / _I variants of SULD_1D_ARRAY_base: handle $s as an Int64Regs register
// or as an i64imm immediate, respectively.
3989 multiclass SULD_1D_ARRAY<string inst, NVPTXRegClass outtype> {
3990   def _R : SULD_1D_ARRAY_base<inst, outtype, (ins Int64Regs:$s)>;
3991   def _I : SULD_1D_ARRAY_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.a1d" instantiations for b8/b16/b32/b64 with the .clamp, .trap and
// .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the result.
3994 defm SULD_1D_ARRAY_I8_CLAMP
3995   : SULD_1D_ARRAY<"suld.b.a1d.b8.clamp", Int16Regs>;
3996 defm SULD_1D_ARRAY_I16_CLAMP
3997   : SULD_1D_ARRAY<"suld.b.a1d.b16.clamp", Int16Regs>;
3998 defm SULD_1D_ARRAY_I32_CLAMP
3999   : SULD_1D_ARRAY<"suld.b.a1d.b32.clamp", Int32Regs>;
4000 defm SULD_1D_ARRAY_I64_CLAMP
4001   : SULD_1D_ARRAY<"suld.b.a1d.b64.clamp", Int64Regs>;
4003 defm SULD_1D_ARRAY_I8_TRAP
4004   : SULD_1D_ARRAY<"suld.b.a1d.b8.trap", Int16Regs>;
4005 defm SULD_1D_ARRAY_I16_TRAP
4006   : SULD_1D_ARRAY<"suld.b.a1d.b16.trap", Int16Regs>;
4007 defm SULD_1D_ARRAY_I32_TRAP
4008   : SULD_1D_ARRAY<"suld.b.a1d.b32.trap", Int32Regs>;
4009 defm SULD_1D_ARRAY_I64_TRAP
4010   : SULD_1D_ARRAY<"suld.b.a1d.b64.trap", Int64Regs>;
4012 defm SULD_1D_ARRAY_I8_ZERO
4013   : SULD_1D_ARRAY<"suld.b.a1d.b8.zero", Int16Regs>;
4014 defm SULD_1D_ARRAY_I16_ZERO
4015   : SULD_1D_ARRAY<"suld.b.a1d.b16.zero", Int16Regs>;
4016 defm SULD_1D_ARRAY_I32_ZERO
4017   : SULD_1D_ARRAY<"suld.b.a1d.b32.zero", Int32Regs>;
4018 defm SULD_1D_ARRAY_I64_ZERO
4019   : SULD_1D_ARRAY<"suld.b.a1d.b64.zero", Int64Regs>;
// Base record for single-result 2D surface loads.
// Inputs after the handle dag `surf`: coordinates $x and $y, both Int32Regs,
// printed as the two-element vector {$x, $y}. outtype is the class of $r.
4021 class SULD_2D_base<string inst, NVPTXRegClass outtype, dag surf>
4022     : NVPTXInst<(outs outtype:$r),
4023                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y)),
4024                 inst # " \\{$r\\}, [$s, \\{$x, $y\\}];",
4025                 []>;
// _R / _I variants of SULD_2D_base: handle $s as an Int64Regs register or as
// an i64imm immediate, respectively.
4026 multiclass SULD_2D<string inst, NVPTXRegClass outtype> {
4027   def _R : SULD_2D_base<inst, outtype, (ins Int64Regs:$s)>;
4028   def _I : SULD_2D_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.2d" instantiations for b8/b16/b32/b64 with the .clamp, .trap and
// .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the result.
4031 defm SULD_2D_I8_CLAMP : SULD_2D<"suld.b.2d.b8.clamp", Int16Regs>;
4032 defm SULD_2D_I16_CLAMP : SULD_2D<"suld.b.2d.b16.clamp", Int16Regs>;
4033 defm SULD_2D_I32_CLAMP : SULD_2D<"suld.b.2d.b32.clamp", Int32Regs>;
4034 defm SULD_2D_I64_CLAMP : SULD_2D<"suld.b.2d.b64.clamp", Int64Regs>;
4036 defm SULD_2D_I8_TRAP : SULD_2D<"suld.b.2d.b8.trap", Int16Regs>;
4037 defm SULD_2D_I16_TRAP : SULD_2D<"suld.b.2d.b16.trap", Int16Regs>;
4038 defm SULD_2D_I32_TRAP : SULD_2D<"suld.b.2d.b32.trap", Int32Regs>;
4039 defm SULD_2D_I64_TRAP : SULD_2D<"suld.b.2d.b64.trap", Int64Regs>;
4041 defm SULD_2D_I8_ZERO : SULD_2D<"suld.b.2d.b8.zero", Int16Regs>;
4042 defm SULD_2D_I16_ZERO : SULD_2D<"suld.b.2d.b16.zero", Int16Regs>;
4043 defm SULD_2D_I32_ZERO : SULD_2D<"suld.b.2d.b32.zero", Int32Regs>;
4044 defm SULD_2D_I64_ZERO : SULD_2D<"suld.b.2d.b64.zero", Int64Regs>;
// Base record for single-result 2D-array surface loads ("a2d" mnemonics).
// Inputs after the handle dag `surf`: $l, $x and $y, all Int32Regs.
// The asm string prints the four-element vector {$l, $x, $y, $y}; $y is
// repeated to fill the fourth slot.
4046 class SULD_2D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf>
4047     : NVPTXInst<(outs outtype:$r),
4048                 !con(surf, (ins Int32Regs:$l, Int32Regs:$x, Int32Regs:$y)),
4049                 inst # " \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
4050                 []>;
// _R / _I variants of SULD_2D_ARRAY_base: handle $s as an Int64Regs register
// or as an i64imm immediate, respectively.
4051 multiclass SULD_2D_ARRAY<string inst, NVPTXRegClass outtype> {
4052   def _R : SULD_2D_ARRAY_base<inst, outtype, (ins Int64Regs:$s)>;
4053   def _I : SULD_2D_ARRAY_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.a2d" instantiations for b8/b16/b32/b64 with the .clamp, .trap and
// .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the result.
4056 defm SULD_2D_ARRAY_I8_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b8.clamp", Int16Regs>;
4057 defm SULD_2D_ARRAY_I16_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b16.clamp", Int16Regs>;
4058 defm SULD_2D_ARRAY_I32_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b32.clamp", Int32Regs>;
4059 defm SULD_2D_ARRAY_I64_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b64.clamp", Int64Regs>;
4061 defm SULD_2D_ARRAY_I8_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b8.trap", Int16Regs>;
4062 defm SULD_2D_ARRAY_I16_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b16.trap", Int16Regs>;
4063 defm SULD_2D_ARRAY_I32_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b32.trap", Int32Regs>;
4064 defm SULD_2D_ARRAY_I64_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b64.trap", Int64Regs>;
4066 defm SULD_2D_ARRAY_I8_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b8.zero", Int16Regs>;
4067 defm SULD_2D_ARRAY_I16_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b16.zero", Int16Regs>;
4068 defm SULD_2D_ARRAY_I32_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b32.zero", Int32Regs>;
4069 defm SULD_2D_ARRAY_I64_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b64.zero", Int64Regs>;
// Base record for single-result 3D surface loads.
// Inputs after the handle dag `surf`: coordinates $x, $y and $z, all
// Int32Regs. The asm string prints {$x, $y, $z, $z}; $z is repeated to fill
// the fourth slot.
4071 class SULD_3D_base<string inst, NVPTXRegClass outtype, dag surf>
4072     : NVPTXInst<(outs outtype:$r),
4073                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z)),
4074                 inst # " \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
4075                 []>;
// _R / _I variants of SULD_3D_base: handle $s as an Int64Regs register or as
// an i64imm immediate, respectively.
4076 multiclass SULD_3D<string inst, NVPTXRegClass outtype> {
4077   def _R : SULD_3D_base<inst, outtype, (ins Int64Regs:$s)>;
4078   def _I : SULD_3D_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.3d" instantiations for b8/b16/b32/b64 with the .clamp, .trap and
// .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the result.
4081 defm SULD_3D_I8_CLAMP : SULD_3D<"suld.b.3d.b8.clamp", Int16Regs>;
4082 defm SULD_3D_I16_CLAMP : SULD_3D<"suld.b.3d.b16.clamp", Int16Regs>;
4083 defm SULD_3D_I32_CLAMP : SULD_3D<"suld.b.3d.b32.clamp", Int32Regs>;
4084 defm SULD_3D_I64_CLAMP : SULD_3D<"suld.b.3d.b64.clamp", Int64Regs>;
4086 defm SULD_3D_I8_TRAP : SULD_3D<"suld.b.3d.b8.trap", Int16Regs>;
4087 defm SULD_3D_I16_TRAP : SULD_3D<"suld.b.3d.b16.trap", Int16Regs>;
4088 defm SULD_3D_I32_TRAP : SULD_3D<"suld.b.3d.b32.trap", Int32Regs>;
4089 defm SULD_3D_I64_TRAP : SULD_3D<"suld.b.3d.b64.trap", Int64Regs>;
4091 defm SULD_3D_I8_ZERO : SULD_3D<"suld.b.3d.b8.zero", Int16Regs>;
4092 defm SULD_3D_I16_ZERO : SULD_3D<"suld.b.3d.b16.zero", Int16Regs>;
4093 defm SULD_3D_I32_ZERO : SULD_3D<"suld.b.3d.b32.zero", Int32Regs>;
4094 defm SULD_3D_I64_ZERO : SULD_3D<"suld.b.3d.b64.zero", Int64Regs>;
// Records defined inside this scope get IsSuld = 2. The definitions that
// follow are the V2 surface loads, which have two results ($r, $g).
4097 let IsSuld = 2 in {
// Base record for two-result (v2) 1D surface loads.
//   inst    - mnemonic text placed at the front of the asm string.
//   outtype - register class of both results $r and $g.
//   surf    - input dag prepended via !con; expected to name the $s operand.
// The single coordinate $x is always Int32Regs.
4099 class SULD_1D_V2_base<string inst, NVPTXRegClass outtype, dag surf>
4100     : NVPTXInst<(outs outtype:$r, outtype:$g),
4101                 !con(surf, (ins Int32Regs:$x)),
4102                 inst # " \\{$r, $g\\}, [$s, \\{$x\\}];",
4103                 []>;
// _R / _I variants of SULD_1D_V2_base: handle $s as an Int64Regs register or
// as an i64imm immediate, respectively.
4104 multiclass SULD_1D_V2<string inst, NVPTXRegClass outtype> {
4105   def _R : SULD_1D_V2_base<inst, outtype, (ins Int64Regs:$s)>;
4106   def _I : SULD_1D_V2_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.1d.v2" instantiations for b8/b16/b32/b64 with the .clamp, .trap and
// .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the results.
4109 defm SULD_1D_V2I8_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b8.clamp", Int16Regs>;
4110 defm SULD_1D_V2I16_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b16.clamp", Int16Regs>;
4111 defm SULD_1D_V2I32_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b32.clamp", Int32Regs>;
4112 defm SULD_1D_V2I64_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b64.clamp", Int64Regs>;
4114 defm SULD_1D_V2I8_TRAP : SULD_1D_V2<"suld.b.1d.v2.b8.trap", Int16Regs>;
4115 defm SULD_1D_V2I16_TRAP : SULD_1D_V2<"suld.b.1d.v2.b16.trap", Int16Regs>;
4116 defm SULD_1D_V2I32_TRAP : SULD_1D_V2<"suld.b.1d.v2.b32.trap", Int32Regs>;
4117 defm SULD_1D_V2I64_TRAP : SULD_1D_V2<"suld.b.1d.v2.b64.trap", Int64Regs>;
4119 defm SULD_1D_V2I8_ZERO : SULD_1D_V2<"suld.b.1d.v2.b8.zero", Int16Regs>;
4120 defm SULD_1D_V2I16_ZERO : SULD_1D_V2<"suld.b.1d.v2.b16.zero", Int16Regs>;
4121 defm SULD_1D_V2I32_ZERO : SULD_1D_V2<"suld.b.1d.v2.b32.zero", Int32Regs>;
4122 defm SULD_1D_V2I64_ZERO : SULD_1D_V2<"suld.b.1d.v2.b64.zero", Int64Regs>;
// Base record for two-result (v2) 1D-array surface loads.
// Inputs after the handle dag `surf`: $l and $x, both Int32Regs, printed as
// the two-element vector {$l, $x}. outtype is the class of $r and $g.
4124 class SULD_1D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf>
4125     : NVPTXInst<(outs outtype:$r, outtype:$g),
4126                 !con(surf, (ins Int32Regs:$l, Int32Regs:$x)),
4127                 inst # " \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
4128                 []>;
// _R / _I variants of SULD_1D_ARRAY_V2_base: handle $s as an Int64Regs
// register or as an i64imm immediate, respectively.
4129 multiclass SULD_1D_ARRAY_V2<string inst, NVPTXRegClass outtype> {
4130   def _R : SULD_1D_ARRAY_V2_base<inst, outtype, (ins Int64Regs:$s)>;
4131   def _I : SULD_1D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.a1d.v2" instantiations for b8/b16/b32/b64 with the .clamp, .trap
// and .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the results.
4134 defm SULD_1D_ARRAY_V2I8_CLAMP
4135   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.clamp", Int16Regs>;
4136 defm SULD_1D_ARRAY_V2I16_CLAMP
4137   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.clamp", Int16Regs>;
4138 defm SULD_1D_ARRAY_V2I32_CLAMP
4139   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.clamp", Int32Regs>;
4140 defm SULD_1D_ARRAY_V2I64_CLAMP
4141   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.clamp", Int64Regs>;
4143 defm SULD_1D_ARRAY_V2I8_TRAP
4144   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.trap", Int16Regs>;
4145 defm SULD_1D_ARRAY_V2I16_TRAP
4146   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.trap", Int16Regs>;
4147 defm SULD_1D_ARRAY_V2I32_TRAP
4148   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.trap", Int32Regs>;
4149 defm SULD_1D_ARRAY_V2I64_TRAP
4150   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.trap", Int64Regs>;
4152 defm SULD_1D_ARRAY_V2I8_ZERO
4153   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.zero", Int16Regs>;
4154 defm SULD_1D_ARRAY_V2I16_ZERO
4155   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.zero", Int16Regs>;
4156 defm SULD_1D_ARRAY_V2I32_ZERO
4157   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.zero", Int32Regs>;
4158 defm SULD_1D_ARRAY_V2I64_ZERO
4159   : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.zero", Int64Regs>;
// Base record for two-result (v2) 2D surface loads.
// Inputs after the handle dag `surf`: coordinates $x and $y, both Int32Regs,
// printed as the two-element vector {$x, $y}. outtype is the class of $r, $g.
4161 class SULD_2D_V2_base<string inst, NVPTXRegClass outtype, dag surf>
4162     : NVPTXInst<(outs outtype:$r, outtype:$g),
4163                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y)),
4164                 inst # " \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
4165                 []>;
// _R / _I variants of SULD_2D_V2_base: handle $s as an Int64Regs register or
// as an i64imm immediate, respectively.
4166 multiclass SULD_2D_V2<string inst, NVPTXRegClass outtype> {
4167   def _R : SULD_2D_V2_base<inst, outtype, (ins Int64Regs:$s)>;
4168   def _I : SULD_2D_V2_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.2d.v2" instantiations for b8/b16/b32/b64 with the .clamp, .trap and
// .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the results.
4171 defm SULD_2D_V2I8_CLAMP
4172   : SULD_2D_V2<"suld.b.2d.v2.b8.clamp", Int16Regs>;
4173 defm SULD_2D_V2I16_CLAMP
4174   : SULD_2D_V2<"suld.b.2d.v2.b16.clamp", Int16Regs>;
4175 defm SULD_2D_V2I32_CLAMP
4176   : SULD_2D_V2<"suld.b.2d.v2.b32.clamp", Int32Regs>;
4177 defm SULD_2D_V2I64_CLAMP
4178   : SULD_2D_V2<"suld.b.2d.v2.b64.clamp", Int64Regs>;
4180 defm SULD_2D_V2I8_TRAP
4181   : SULD_2D_V2<"suld.b.2d.v2.b8.trap", Int16Regs>;
4182 defm SULD_2D_V2I16_TRAP
4183   : SULD_2D_V2<"suld.b.2d.v2.b16.trap", Int16Regs>;
4184 defm SULD_2D_V2I32_TRAP
4185   : SULD_2D_V2<"suld.b.2d.v2.b32.trap", Int32Regs>;
4186 defm SULD_2D_V2I64_TRAP
4187   : SULD_2D_V2<"suld.b.2d.v2.b64.trap", Int64Regs>;
4189 defm SULD_2D_V2I8_ZERO
4190   : SULD_2D_V2<"suld.b.2d.v2.b8.zero", Int16Regs>;
4191 defm SULD_2D_V2I16_ZERO
4192   : SULD_2D_V2<"suld.b.2d.v2.b16.zero", Int16Regs>;
4193 defm SULD_2D_V2I32_ZERO
4194   : SULD_2D_V2<"suld.b.2d.v2.b32.zero", Int32Regs>;
4195 defm SULD_2D_V2I64_ZERO
4196   : SULD_2D_V2<"suld.b.2d.v2.b64.zero", Int64Regs>;
// Base record for two-result (v2) 2D-array surface loads.
// Inputs after the handle dag `surf`: $l, $x and $y, all Int32Regs.
// The asm string prints {$l, $x, $y, $y}; $y is repeated to fill the fourth
// slot.
4198 class SULD_2D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf>
4199     : NVPTXInst<(outs outtype:$r, outtype:$g),
4200                 !con(surf, (ins Int32Regs:$l, Int32Regs:$x, Int32Regs:$y)),
4201                 inst # " \\{$r, $g\\}, [$s, \\{$l, $x, $y, $y\\}];",
4202                 []>;
// _R / _I variants of SULD_2D_ARRAY_V2_base: handle $s as an Int64Regs
// register or as an i64imm immediate, respectively.
4203 multiclass SULD_2D_ARRAY_V2<string inst, NVPTXRegClass outtype> {
4204   def _R : SULD_2D_ARRAY_V2_base<inst, outtype, (ins Int64Regs:$s)>;
4205   def _I : SULD_2D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.a2d.v2" instantiations for b8/b16/b32/b64 with the .clamp, .trap
// and .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the results.
4208 defm SULD_2D_ARRAY_V2I8_CLAMP
4209   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.clamp", Int16Regs>;
4210 defm SULD_2D_ARRAY_V2I16_CLAMP
4211   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.clamp", Int16Regs>;
4212 defm SULD_2D_ARRAY_V2I32_CLAMP
4213   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.clamp", Int32Regs>;
4214 defm SULD_2D_ARRAY_V2I64_CLAMP
4215   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.clamp", Int64Regs>;
4217 defm SULD_2D_ARRAY_V2I8_TRAP
4218   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.trap", Int16Regs>;
4219 defm SULD_2D_ARRAY_V2I16_TRAP
4220   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.trap", Int16Regs>;
4221 defm SULD_2D_ARRAY_V2I32_TRAP
4222   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.trap", Int32Regs>;
4223 defm SULD_2D_ARRAY_V2I64_TRAP
4224   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.trap", Int64Regs>;
4226 defm SULD_2D_ARRAY_V2I8_ZERO
4227   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.zero", Int16Regs>;
4228 defm SULD_2D_ARRAY_V2I16_ZERO
4229   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.zero", Int16Regs>;
4230 defm SULD_2D_ARRAY_V2I32_ZERO
4231   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.zero", Int32Regs>;
4232 defm SULD_2D_ARRAY_V2I64_ZERO
4233   : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.zero", Int64Regs>;
// Base record for two-result (v2) 3D surface loads.
// Inputs after the handle dag `surf`: coordinates $x, $y and $z, all
// Int32Regs. The asm string prints {$x, $y, $z, $z}; $z is repeated to fill
// the fourth slot.
4235 class SULD_3D_V2_base<string inst, NVPTXRegClass outtype, dag surf>
4236     : NVPTXInst<(outs outtype:$r, outtype:$g),
4237                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z)),
4238                 inst # " \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
4239                 []>;
// _R / _I variants of SULD_3D_V2_base: handle $s as an Int64Regs register or
// as an i64imm immediate, respectively.
4240 multiclass SULD_3D_V2<string inst, NVPTXRegClass outtype> {
4241   def _R : SULD_3D_V2_base<inst, outtype, (ins Int64Regs:$s)>;
4242   def _I : SULD_3D_V2_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.3d.v2" instantiations for b8/b16/b32/b64 with the .clamp, .trap and
// .zero mnemonic suffixes. b8 and b16 both use Int16Regs for the results.
4245 defm SULD_3D_V2I8_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b8.clamp", Int16Regs>;
4246 defm SULD_3D_V2I16_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b16.clamp", Int16Regs>;
4247 defm SULD_3D_V2I32_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b32.clamp", Int32Regs>;
4248 defm SULD_3D_V2I64_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b64.clamp", Int64Regs>;
4250 defm SULD_3D_V2I8_TRAP : SULD_3D_V2<"suld.b.3d.v2.b8.trap", Int16Regs>;
4251 defm SULD_3D_V2I16_TRAP : SULD_3D_V2<"suld.b.3d.v2.b16.trap", Int16Regs>;
4252 defm SULD_3D_V2I32_TRAP : SULD_3D_V2<"suld.b.3d.v2.b32.trap", Int32Regs>;
4253 defm SULD_3D_V2I64_TRAP : SULD_3D_V2<"suld.b.3d.v2.b64.trap", Int64Regs>;
4255 defm SULD_3D_V2I8_ZERO : SULD_3D_V2<"suld.b.3d.v2.b8.zero", Int16Regs>;
4256 defm SULD_3D_V2I16_ZERO : SULD_3D_V2<"suld.b.3d.v2.b16.zero", Int16Regs>;
4257 defm SULD_3D_V2I32_ZERO : SULD_3D_V2<"suld.b.3d.v2.b32.zero", Int32Regs>;
4258 defm SULD_3D_V2I64_ZERO : SULD_3D_V2<"suld.b.3d.v2.b64.zero", Int64Regs>;
// Records defined inside this scope get IsSuld = 3. The definitions that
// follow are the V4 surface loads, which have four results ($r, $g, $b, $a).
4262 let IsSuld = 3 in {
// Base record for four-result (v4) 1D surface loads.
//   inst    - mnemonic text placed at the front of the asm string.
//   outtype - register class of the results $r, $g, $b, $a.
//   surf    - input dag prepended via !con; expected to name the $s operand.
// The single coordinate $x is always Int32Regs.
4264 class SULD_1D_V4_base<string inst, NVPTXRegClass outtype, dag surf>
4265     : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
4266                 !con(surf, (ins Int32Regs:$x)),
4267                 inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
4268                 []>;
// _R / _I variants of SULD_1D_V4_base: handle $s as an Int64Regs register or
// as an i64imm immediate, respectively.
4269 multiclass SULD_1D_V4<string inst, NVPTXRegClass outtype> {
4270   def _R : SULD_1D_V4_base<inst, outtype, (ins Int64Regs:$s)>;
4271   def _I : SULD_1D_V4_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.1d.v4" instantiations for b8/b16/b32 with the .clamp, .trap and
// .zero mnemonic suffixes. No b64 form is instantiated for v4; b8 and b16
// both use Int16Regs for the results.
4274 defm SULD_1D_V4I8_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b8.clamp", Int16Regs>;
4275 defm SULD_1D_V4I16_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b16.clamp", Int16Regs>;
4276 defm SULD_1D_V4I32_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b32.clamp", Int32Regs>;
4278 defm SULD_1D_V4I8_TRAP : SULD_1D_V4<"suld.b.1d.v4.b8.trap", Int16Regs>;
4279 defm SULD_1D_V4I16_TRAP : SULD_1D_V4<"suld.b.1d.v4.b16.trap", Int16Regs>;
4280 defm SULD_1D_V4I32_TRAP : SULD_1D_V4<"suld.b.1d.v4.b32.trap", Int32Regs>;
4282 defm SULD_1D_V4I8_ZERO : SULD_1D_V4<"suld.b.1d.v4.b8.zero", Int16Regs>;
4283 defm SULD_1D_V4I16_ZERO : SULD_1D_V4<"suld.b.1d.v4.b16.zero", Int16Regs>;
4284 defm SULD_1D_V4I32_ZERO : SULD_1D_V4<"suld.b.1d.v4.b32.zero", Int32Regs>;
// Base record for four-result (v4) 1D-array surface loads.
// Inputs after the handle dag `surf`: $l and $x, both Int32Regs, printed as
// the two-element vector {$l, $x}. outtype is the class of $r, $g, $b, $a.
4286 class SULD_1D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf>
4287     : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
4288                 !con(surf, (ins Int32Regs:$l, Int32Regs:$x)),
4289                 inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$l, $x\\}];",
4290                 []>;
// _R / _I variants of SULD_1D_ARRAY_V4_base: handle $s as an Int64Regs
// register or as an i64imm immediate, respectively.
4291 multiclass SULD_1D_ARRAY_V4<string inst, NVPTXRegClass outtype> {
4292   def _R : SULD_1D_ARRAY_V4_base<inst, outtype, (ins Int64Regs:$s)>;
4293   def _I : SULD_1D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.a1d.v4" instantiations for b8/b16/b32 with the .clamp, .trap and
// .zero mnemonic suffixes. No b64 form is instantiated for v4; b8 and b16
// both use Int16Regs for the results.
4296 defm SULD_1D_ARRAY_V4I8_CLAMP
4297   : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.clamp", Int16Regs>;
4298 defm SULD_1D_ARRAY_V4I16_CLAMP
4299   : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.clamp", Int16Regs>;
4300 defm SULD_1D_ARRAY_V4I32_CLAMP
4301   : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.clamp", Int32Regs>;
4303 defm SULD_1D_ARRAY_V4I8_TRAP
4304   : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.trap", Int16Regs>;
4305 defm SULD_1D_ARRAY_V4I16_TRAP
4306   : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.trap", Int16Regs>;
4307 defm SULD_1D_ARRAY_V4I32_TRAP
4308   : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.trap", Int32Regs>;
4310 defm SULD_1D_ARRAY_V4I8_ZERO
4311   : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.zero", Int16Regs>;
4312 defm SULD_1D_ARRAY_V4I16_ZERO
4313   : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.zero", Int16Regs>;
4314 defm SULD_1D_ARRAY_V4I32_ZERO
4315   : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.zero", Int32Regs>;
// Base record for four-result (v4) 2D surface loads.
// Inputs after the handle dag `surf`: coordinates $x and $y, both Int32Regs,
// printed as the two-element vector {$x, $y}. outtype is the class of
// $r, $g, $b, $a.
4317 class SULD_2D_V4_base<string inst, NVPTXRegClass outtype, dag surf>
4318     : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
4319                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y)),
4320                 inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
4321                 []>;
// _R / _I variants of SULD_2D_V4_base: handle $s as an Int64Regs register or
// as an i64imm immediate, respectively.
4322 multiclass SULD_2D_V4<string inst, NVPTXRegClass outtype> {
4323   def _R : SULD_2D_V4_base<inst, outtype, (ins Int64Regs:$s)>;
4324   def _I : SULD_2D_V4_base<inst, outtype, (ins i64imm:$s)>;
// "suld.b.2d.v4" instantiations for b8/b16/b32 with the .clamp, .trap and
// .zero mnemonic suffixes. No b64 form is instantiated for v4; b8 and b16
// both use Int16Regs for the results.
4327 defm SULD_2D_V4I8_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b8.clamp", Int16Regs>;
4328 defm SULD_2D_V4I16_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b16.clamp", Int16Regs>;
4329 defm SULD_2D_V4I32_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b32.clamp", Int32Regs>;
4331 defm SULD_2D_V4I8_TRAP : SULD_2D_V4<"suld.b.2d.v4.b8.trap", Int16Regs>;
4332 defm SULD_2D_V4I16_TRAP : SULD_2D_V4<"suld.b.2d.v4.b16.trap", Int16Regs>;
4333 defm SULD_2D_V4I32_TRAP : SULD_2D_V4<"suld.b.2d.v4.b32.trap", Int32Regs>;
4335 defm SULD_2D_V4I8_ZERO : SULD_2D_V4<"suld.b.2d.v4.b8.zero", Int16Regs>;
4336 defm SULD_2D_V4I16_ZERO : SULD_2D_V4<"suld.b.2d.v4.b16.zero", Int16Regs>;
4337 defm SULD_2D_V4I32_ZERO : SULD_2D_V4<"suld.b.2d.v4.b32.zero", Int32Regs>;
// Base record for four-result (v4) 2D-array surface loads.
// Inputs after the handle dag `surf`: $l, $x and $y, all Int32Regs.
// The asm string prints {$l, $x, $y, $y}; $y is repeated to fill the fourth
// slot.
4339 class SULD_2D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf>
4340     : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
4341                 !con(surf, (ins Int32Regs:$l, Int32Regs:$x, Int32Regs:$y)),
4342                 inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$l, $x, $y, $y\\}];",
4343                 []>;
// Emits the two operand forms of a SULD_2D_ARRAY_V4_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4344 multiclass SULD_2D_ARRAY_V4<string inst, NVPTXRegClass outtype> {
4345   def _R : SULD_2D_ARRAY_V4_base<inst, outtype, (ins Int64Regs:$s)>;
4346   def _I : SULD_2D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>;
// suld.b.a2d.v4 instantiations: b8/b16/b32 element widths for each of the
// .clamp, .trap and .zero mnemonic suffixes. Each defm yields an _R and an _I
// record. b8 and b16 use Int16Regs; b32 uses Int32Regs.
4349 defm SULD_2D_ARRAY_V4I8_CLAMP
4350   : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.clamp", Int16Regs>;
4351 defm SULD_2D_ARRAY_V4I16_CLAMP
4352   : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.clamp", Int16Regs>;
4353 defm SULD_2D_ARRAY_V4I32_CLAMP
4354   : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.clamp", Int32Regs>;
4356 defm SULD_2D_ARRAY_V4I8_TRAP
4357   : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.trap", Int16Regs>;
4358 defm SULD_2D_ARRAY_V4I16_TRAP
4359   : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.trap", Int16Regs>;
4360 defm SULD_2D_ARRAY_V4I32_TRAP
4361   : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.trap", Int32Regs>;
4363 defm SULD_2D_ARRAY_V4I8_ZERO
4364   : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.zero", Int16Regs>;
4365 defm SULD_2D_ARRAY_V4I16_ZERO
4366   : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.zero", Int16Regs>;
4367 defm SULD_2D_ARRAY_V4I32_ZERO
4368   : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.zero", Int32Regs>;
// Common shape of a four-result 3D surface load.
//   inst    - mnemonic text placed at the start of the asm string.
//   outtype - register class of the four results $r, $g, $b, $a.
//   surf    - dag supplying the operand printed as $s.
// The inputs are `surf` followed by three Int32Regs coordinates $x, $y, $z.
// The asm string prints a four-element coordinate vector in which $z appears
// twice; presumably the fourth slot is padding required by the PTX syntax -
// TODO confirm against the PTX ISA.
4370 class SULD_3D_V4_base<string inst, NVPTXRegClass outtype, dag surf>
4371     : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
4372                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z)),
4373                 inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y, $z, $z\\}];",
4374                 []>;
// Emits the two operand forms of a SULD_3D_V4_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4375 multiclass SULD_3D_V4<string inst, NVPTXRegClass outtype> {
4376   def _R : SULD_3D_V4_base<inst, outtype, (ins Int64Regs:$s)>;
4377   def _I : SULD_3D_V4_base<inst, outtype, (ins i64imm:$s)>;
// suld.b.3d.v4 instantiations: b8/b16/b32 element widths for each of the
// .clamp, .trap and .zero mnemonic suffixes. Each defm yields an _R and an _I
// record. b8 and b16 use Int16Regs; b32 uses Int32Regs.
4380 defm SULD_3D_V4I8_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b8.clamp", Int16Regs>;
4381 defm SULD_3D_V4I16_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b16.clamp", Int16Regs>;
4382 defm SULD_3D_V4I32_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b32.clamp", Int32Regs>;
4384 defm SULD_3D_V4I8_TRAP : SULD_3D_V4<"suld.b.3d.v4.b8.trap", Int16Regs>;
4385 defm SULD_3D_V4I16_TRAP : SULD_3D_V4<"suld.b.3d.v4.b16.trap", Int16Regs>;
4386 defm SULD_3D_V4I32_TRAP : SULD_3D_V4<"suld.b.3d.v4.b32.trap", Int32Regs>;
4388 defm SULD_3D_V4I8_ZERO : SULD_3D_V4<"suld.b.3d.v4.b8.zero", Int16Regs>;
4389 defm SULD_3D_V4I16_ZERO : SULD_3D_V4<"suld.b.3d.v4.b16.zero", Int16Regs>;
4390 defm SULD_3D_V4I32_ZERO : SULD_3D_V4<"suld.b.3d.v4.b32.zero", Int32Regs>;
4394 //-----------------------------------
4395 // Texture Query Intrinsics
4396 //-----------------------------------
// txq.* query instructions. Every def in this scope has IsSurfTexQuery set to
// true. Each query is defined twice with an identical asm string:
//   _R - operand $a is an Int64Regs register.
//   _I - operand $a is an i64imm immediate.
// All of them produce one Int32Regs result $d and carry an empty pattern list.
// Queries defined: channel_order, channel_data_type, width, height, depth,
// array_size, num_samples, num_mipmap_levels.
// NOTE(review): $a is presumably the texture handle - confirm. The closing `}`
// of this `let` scope is not visible in this excerpt.
4398 let IsSurfTexQuery = true in {
4399 def TXQ_CHANNEL_ORDER_R
4400   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4401               "txq.channel_order.b32 \t$d, [$a];",
4402               []>;
4403 def TXQ_CHANNEL_ORDER_I
4404   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4405               "txq.channel_order.b32 \t$d, [$a];",
4406               []>;
4407 def TXQ_CHANNEL_DATA_TYPE_R
4408   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4409               "txq.channel_data_type.b32 \t$d, [$a];",
4410               []>;
4411 def TXQ_CHANNEL_DATA_TYPE_I
4412   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4413               "txq.channel_data_type.b32 \t$d, [$a];",
4414               []>;
4415 def TXQ_WIDTH_R
4416   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4417               "txq.width.b32 \t$d, [$a];",
4418               []>;
4419 def TXQ_WIDTH_I
4420   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4421               "txq.width.b32 \t$d, [$a];",
4422               []>;
4423 def TXQ_HEIGHT_R
4424   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4425               "txq.height.b32 \t$d, [$a];",
4426               []>;
4427 def TXQ_HEIGHT_I
4428   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4429               "txq.height.b32 \t$d, [$a];",
4430               []>;
4431 def TXQ_DEPTH_R
4432   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4433               "txq.depth.b32 \t$d, [$a];",
4434               []>;
4435 def TXQ_DEPTH_I
4436   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4437               "txq.depth.b32 \t$d, [$a];",
4438               []>;
4439 def TXQ_ARRAY_SIZE_R
4440   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4441               "txq.array_size.b32 \t$d, [$a];",
4442               []>;
4443 def TXQ_ARRAY_SIZE_I
4444   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4445               "txq.array_size.b32 \t$d, [$a];",
4446               []>;
4447 def TXQ_NUM_SAMPLES_R
4448   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4449               "txq.num_samples.b32 \t$d, [$a];",
4450               []>;
4451 def TXQ_NUM_SAMPLES_I
4452   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4453               "txq.num_samples.b32 \t$d, [$a];",
4454               []>;
4455 def TXQ_NUM_MIPMAP_LEVELS_R
4456   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4457               "txq.num_mipmap_levels.b32 \t$d, [$a];",
4458               []>;
4459 def TXQ_NUM_MIPMAP_LEVELS_I
4460   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4461               "txq.num_mipmap_levels.b32 \t$d, [$a];",
4462               []>;
// Selection patterns for the texture queries: each int_nvvm_txq_* intrinsic
// applied to an Int64Regs operand is mapped onto the matching TXQ_*_R
// instruction. None of these patterns targets the _I (immediate) variants.
4465 def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a),
4466           (TXQ_CHANNEL_ORDER_R Int64Regs:$a)>;
4467 def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a),
4468           (TXQ_CHANNEL_DATA_TYPE_R Int64Regs:$a)>;
4469 def : Pat<(int_nvvm_txq_width Int64Regs:$a),
4470           (TXQ_WIDTH_R Int64Regs:$a)>;
4471 def : Pat<(int_nvvm_txq_height Int64Regs:$a),
4472           (TXQ_HEIGHT_R Int64Regs:$a)>;
4473 def : Pat<(int_nvvm_txq_depth Int64Regs:$a),
4474           (TXQ_DEPTH_R Int64Regs:$a)>;
4475 def : Pat<(int_nvvm_txq_array_size Int64Regs:$a),
4476           (TXQ_ARRAY_SIZE_R Int64Regs:$a)>;
4477 def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a),
4478           (TXQ_NUM_SAMPLES_R Int64Regs:$a)>;
4479 def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a),
4480           (TXQ_NUM_MIPMAP_LEVELS_R Int64Regs:$a)>;
4483 //-----------------------------------
4484 // Surface Query Intrinsics
4485 //-----------------------------------
// suq.* query instructions. Every def in this scope has IsSurfTexQuery set to
// true. Each query is defined twice with an identical asm string:
//   _R - operand $a is an Int64Regs register.
//   _I - operand $a is an i64imm immediate.
// All of them produce one Int32Regs result $d and carry an empty pattern list.
// Queries defined: channel_order, channel_data_type, width, height, depth,
// array_size.
// NOTE(review): $a is presumably the surface handle - confirm. The closing `}`
// of this `let` scope is not visible in this excerpt.
4487 let IsSurfTexQuery = true in {
4488 def SUQ_CHANNEL_ORDER_R
4489   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4490               "suq.channel_order.b32 \t$d, [$a];",
4491               []>;
4492 def SUQ_CHANNEL_ORDER_I
4493   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4494               "suq.channel_order.b32 \t$d, [$a];",
4495               []>;
4496 def SUQ_CHANNEL_DATA_TYPE_R
4497   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4498               "suq.channel_data_type.b32 \t$d, [$a];",
4499               []>;
4500 def SUQ_CHANNEL_DATA_TYPE_I
4501   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4502               "suq.channel_data_type.b32 \t$d, [$a];",
4503               []>;
4504 def SUQ_WIDTH_R
4505   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4506               "suq.width.b32 \t$d, [$a];",
4507               []>;
4508 def SUQ_WIDTH_I
4509   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4510               "suq.width.b32 \t$d, [$a];",
4511               []>;
4512 def SUQ_HEIGHT_R
4513   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4514               "suq.height.b32 \t$d, [$a];",
4515               []>;
4516 def SUQ_HEIGHT_I
4517   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4518               "suq.height.b32 \t$d, [$a];",
4519               []>;
4520 def SUQ_DEPTH_R
4521   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4522               "suq.depth.b32 \t$d, [$a];",
4523               []>;
4524 def SUQ_DEPTH_I
4525   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4526               "suq.depth.b32 \t$d, [$a];",
4527               []>;
4528 def SUQ_ARRAY_SIZE_R
4529   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
4530               "suq.array_size.b32 \t$d, [$a];",
4531               []>;
4532 def SUQ_ARRAY_SIZE_I
4533   : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
4534               "suq.array_size.b32 \t$d, [$a];",
4535               []>;
// Selection patterns for the surface queries: each int_nvvm_suq_* intrinsic
// applied to an Int64Regs operand is mapped onto the matching SUQ_*_R
// instruction. None of these patterns targets the _I (immediate) variants.
4538 def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a),
4539           (SUQ_CHANNEL_ORDER_R Int64Regs:$a)>;
4540 def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a),
4541           (SUQ_CHANNEL_DATA_TYPE_R Int64Regs:$a)>;
4542 def : Pat<(int_nvvm_suq_width Int64Regs:$a),
4543           (SUQ_WIDTH_R Int64Regs:$a)>;
4544 def : Pat<(int_nvvm_suq_height Int64Regs:$a),
4545           (SUQ_HEIGHT_R Int64Regs:$a)>;
4546 def : Pat<(int_nvvm_suq_depth Int64Regs:$a),
4547           (SUQ_DEPTH_R Int64Regs:$a)>;
4548 def : Pat<(int_nvvm_suq_array_size Int64Regs:$a),
4549           (SUQ_ARRAY_SIZE_R Int64Regs:$a)>;
4552 //===- Handle Query -------------------------------------------------------===//
4554 // TODO: These intrinsics are not yet finalized, pending PTX ISA design work
// istypep.samplerref: takes an Int64Regs operand $a and writes an Int1Regs
// result $d. Unlike the query instructions, the selection pattern is inline:
// it matches int_nvvm_istypep_sampler directly.
4555 def ISTYPEP_SAMPLER
4556   : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
4557               "istypep.samplerref \t$d, $a;",
4558               [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>;
// istypep.surfref: takes an Int64Regs operand $a and writes an Int1Regs
// result $d. The inline pattern matches int_nvvm_istypep_surface directly.
4559 def ISTYPEP_SURFACE
4560   : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
4561               "istypep.surfref \t$d, $a;",
4562               [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>;
// istypep.texref: takes an Int64Regs operand $a and writes an Int1Regs
// result $d. The inline pattern matches int_nvvm_istypep_texture directly.
4563 def ISTYPEP_TEXTURE
4564   : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
4565               "istypep.texref \t$d, $a;",
4566               [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>;
4568 //===- Surface Stores -----------------------------------------------------===//
4570 let IsSust = true in {
// Common shape of a single-value 1D surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored value $r.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs coordinate $x, then the value $r.
// The pattern list is empty.
4572 class SUST_1D_base<string inst, NVPTXRegClass intype, dag surf>
4573     : NVPTXInst<(outs),
4574                 !con(surf, (ins Int32Regs:$x, intype:$r)),
4575                 inst # " \t[$s, \\{$x\\}], \\{$r\\};",
4576                 []>;
// Emits the two operand forms of a SUST_1D_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4577 multiclass SUST_1D<string inst, NVPTXRegClass intype> {
4578   def _R : SUST_1D_base<inst, intype, (ins Int64Regs:$s)>;
4579   def _I : SUST_1D_base<inst, intype, (ins i64imm:$s)>;
// 1D store instantiations. sust.b covers b8/b16/b32/b64 for each of the
// .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap only.
// b8 and b16 values use Int16Regs, b32 uses Int32Regs, b64 uses Int64Regs.
4582 defm SUST_B_1D_B8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>;
4583 defm SUST_B_1D_B16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>;
4584 defm SUST_B_1D_B32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>;
4585 defm SUST_B_1D_B64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>;
4587 defm SUST_B_1D_B8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>;
4588 defm SUST_B_1D_B16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>;
4589 defm SUST_B_1D_B32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>;
4590 defm SUST_B_1D_B64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>;
4592 defm SUST_B_1D_B8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>;
4593 defm SUST_B_1D_B16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>;
4594 defm SUST_B_1D_B32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>;
4595 defm SUST_B_1D_B64_ZERO : SUST_1D<"sust.b.1d.b64.zero", Int64Regs>;
4597 defm SUST_P_1D_B8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>;
4598 defm SUST_P_1D_B16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>;
4599 defm SUST_P_1D_B32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>;
// Common shape of a two-value 1D surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored values $r and $g.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs coordinate $x, then $r and $g.
// The pattern list is empty.
4601 class SUST_1D_V2_base<string inst, NVPTXRegClass intype, dag surf>
4602     : NVPTXInst<(outs),
4603                 !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g)),
4604                 inst # " \t[$s, \\{$x\\}], \\{$r, $g\\};",
4605                 []>;
// Emits the two operand forms of a SUST_1D_V2_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4606 multiclass SUST_1D_V2<string inst, NVPTXRegClass intype> {
4607   def _R : SUST_1D_V2_base<inst, intype, (ins Int64Regs:$s)>;
4608   def _I : SUST_1D_V2_base<inst, intype, (ins i64imm:$s)>;
// 1D two-value store instantiations. sust.b covers b8/b16/b32/b64 for each of
// the .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap
// only. b8 and b16 use Int16Regs, b32 uses Int32Regs, b64 uses Int64Regs.
4611 defm SUST_B_1D_V2B8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>;
4612 defm SUST_B_1D_V2B16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>;
4613 defm SUST_B_1D_V2B32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>;
4614 defm SUST_B_1D_V2B64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>;
4616 defm SUST_B_1D_V2B8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>;
4617 defm SUST_B_1D_V2B16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>;
4618 defm SUST_B_1D_V2B32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>;
4619 defm SUST_B_1D_V2B64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>;
4621 defm SUST_B_1D_V2B8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>;
4622 defm SUST_B_1D_V2B16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>;
4623 defm SUST_B_1D_V2B32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>;
4624 defm SUST_B_1D_V2B64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>;
4626 defm SUST_P_1D_V2B8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>;
4627 defm SUST_P_1D_V2B16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>;
4628 defm SUST_P_1D_V2B32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>;
// Common shape of a four-value 1D surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored values $r, $g, $b, $a.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs coordinate $x, then the four values.
// The pattern list is empty.
4630 class SUST_1D_V4_base<string inst, NVPTXRegClass intype, dag surf>
4631     : NVPTXInst<(outs),
4632                 !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g,
4633                                 intype:$b, intype:$a)),
4634                 inst # " \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
4635                 []>;
// Emits the two operand forms of a SUST_1D_V4_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4636 multiclass SUST_1D_V4<string inst, NVPTXRegClass intype> {
4637   def _R : SUST_1D_V4_base<inst, intype, (ins Int64Regs:$s)>;
4638   def _I : SUST_1D_V4_base<inst, intype, (ins i64imm:$s)>;
// 1D four-value store instantiations. sust.b covers b8/b16/b32 for each of the
// .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap only.
// No b64 form is instantiated for the four-value stores. b8 and b16 use
// Int16Regs, b32 uses Int32Regs.
4641 defm SUST_B_1D_V4B8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>;
4642 defm SUST_B_1D_V4B16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>;
4643 defm SUST_B_1D_V4B32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>;
4645 defm SUST_B_1D_V4B8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>;
4646 defm SUST_B_1D_V4B16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>;
4647 defm SUST_B_1D_V4B32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>;
4649 defm SUST_B_1D_V4B8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>;
4650 defm SUST_B_1D_V4B16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>;
4651 defm SUST_B_1D_V4B32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>;
4653 defm SUST_P_1D_V4B8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>;
4654 defm SUST_P_1D_V4B16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>;
4655 defm SUST_P_1D_V4B32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>;
// Common shape of a single-value 1D-array surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored value $r.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs operands $idx and $x, then $r.
// $idx is printed first in the coordinate vector (presumably the array layer
// index - confirm). The pattern list is empty.
4657 class SUST_1D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf>
4658     : NVPTXInst<(outs),
4659                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r)),
4660                 inst # " \t[$s, \\{$idx, $x\\}], \\{$r\\};",
4661                 []>;
// Emits the two operand forms of a SUST_1D_ARRAY_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4662 multiclass SUST_1D_ARRAY<string inst, NVPTXRegClass intype> {
4663   def _R : SUST_1D_ARRAY_base<inst, intype, (ins Int64Regs:$s)>;
4664   def _I : SUST_1D_ARRAY_base<inst, intype, (ins i64imm:$s)>;
// 1D-array store instantiations. sust.b covers b8/b16/b32/b64 for each of the
// .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap only.
// b8 and b16 use Int16Regs, b32 uses Int32Regs, b64 uses Int64Regs.
4667 defm SUST_B_1D_ARRAY_B8_CLAMP
4668   : SUST_1D_ARRAY<"sust.b.a1d.b8.clamp", Int16Regs>;
4669 defm SUST_B_1D_ARRAY_B16_CLAMP
4670   : SUST_1D_ARRAY<"sust.b.a1d.b16.clamp", Int16Regs>;
4671 defm SUST_B_1D_ARRAY_B32_CLAMP
4672   : SUST_1D_ARRAY<"sust.b.a1d.b32.clamp", Int32Regs>;
4673 defm SUST_B_1D_ARRAY_B64_CLAMP
4674   : SUST_1D_ARRAY<"sust.b.a1d.b64.clamp", Int64Regs>;
4676 defm SUST_B_1D_ARRAY_B8_TRAP
4677   : SUST_1D_ARRAY<"sust.b.a1d.b8.trap", Int16Regs>;
4678 defm SUST_B_1D_ARRAY_B16_TRAP
4679   : SUST_1D_ARRAY<"sust.b.a1d.b16.trap", Int16Regs>;
4680 defm SUST_B_1D_ARRAY_B32_TRAP
4681   : SUST_1D_ARRAY<"sust.b.a1d.b32.trap", Int32Regs>;
4682 defm SUST_B_1D_ARRAY_B64_TRAP
4683   : SUST_1D_ARRAY<"sust.b.a1d.b64.trap", Int64Regs>;
4685 defm SUST_B_1D_ARRAY_B8_ZERO
4686   : SUST_1D_ARRAY<"sust.b.a1d.b8.zero", Int16Regs>;
4687 defm SUST_B_1D_ARRAY_B16_ZERO
4688   : SUST_1D_ARRAY<"sust.b.a1d.b16.zero", Int16Regs>;
4689 defm SUST_B_1D_ARRAY_B32_ZERO
4690   : SUST_1D_ARRAY<"sust.b.a1d.b32.zero", Int32Regs>;
4691 defm SUST_B_1D_ARRAY_B64_ZERO
4692   : SUST_1D_ARRAY<"sust.b.a1d.b64.zero", Int64Regs>;
4694 defm SUST_P_1D_ARRAY_B8_TRAP
4695   : SUST_1D_ARRAY<"sust.p.a1d.b8.trap", Int16Regs>;
4696 defm SUST_P_1D_ARRAY_B16_TRAP
4697   : SUST_1D_ARRAY<"sust.p.a1d.b16.trap", Int16Regs>;
4698 defm SUST_P_1D_ARRAY_B32_TRAP
4699   : SUST_1D_ARRAY<"sust.p.a1d.b32.trap", Int32Regs>;
// Common shape of a two-value 1D-array surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored values $r and $g.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs operands $idx and $x, then $r and $g.
// The pattern list is empty.
4701 class SUST_1D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf>
4702     : NVPTXInst<(outs),
4703                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x,
4704                                 intype:$r, intype:$g)),
4705                 inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
4706                 []>;
// Emits the two operand forms of a SUST_1D_ARRAY_V2_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4707 multiclass SUST_1D_ARRAY_V2<string inst, NVPTXRegClass intype> {
4708   def _R : SUST_1D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s)>;
4709   def _I : SUST_1D_ARRAY_V2_base<inst, intype, (ins i64imm:$s)>;
// 1D-array two-value store instantiations. sust.b covers b8/b16/b32/b64 for
// each of the .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with
// .trap only. b8 and b16 use Int16Regs, b32 uses Int32Regs, b64 uses Int64Regs.
4712 defm SUST_B_1D_ARRAY_V2B8_CLAMP
4713   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.clamp", Int16Regs>;
4714 defm SUST_B_1D_ARRAY_V2B16_CLAMP
4715   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.clamp", Int16Regs>;
4716 defm SUST_B_1D_ARRAY_V2B32_CLAMP
4717   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.clamp", Int32Regs>;
4718 defm SUST_B_1D_ARRAY_V2B64_CLAMP
4719   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.clamp", Int64Regs>;
4721 defm SUST_B_1D_ARRAY_V2B8_TRAP
4722   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.trap", Int16Regs>;
4723 defm SUST_B_1D_ARRAY_V2B16_TRAP
4724   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.trap", Int16Regs>;
4725 defm SUST_B_1D_ARRAY_V2B32_TRAP
4726   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.trap", Int32Regs>;
4727 defm SUST_B_1D_ARRAY_V2B64_TRAP
4728   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.trap", Int64Regs>;
4730 defm SUST_B_1D_ARRAY_V2B8_ZERO
4731   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.zero", Int16Regs>;
4732 defm SUST_B_1D_ARRAY_V2B16_ZERO
4733   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.zero", Int16Regs>;
4734 defm SUST_B_1D_ARRAY_V2B32_ZERO
4735   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.zero", Int32Regs>;
4736 defm SUST_B_1D_ARRAY_V2B64_ZERO
4737   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.zero", Int64Regs>;
4739 defm SUST_P_1D_ARRAY_V2B8_TRAP
4740   : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b8.trap", Int16Regs>;
4741 defm SUST_P_1D_ARRAY_V2B16_TRAP
4742   : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b16.trap", Int16Regs>;
4743 defm SUST_P_1D_ARRAY_V2B32_TRAP
4744   : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b32.trap", Int32Regs>;
// Common shape of a four-value 1D-array surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored values $r, $g, $b, $a.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs operands $idx and $x, then the four
// values. The pattern list is empty.
4746 class SUST_1D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf>
4747     : NVPTXInst<(outs),
4748                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x,
4749                                 intype:$r, intype:$g, intype:$b, intype:$a)),
4750                 inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g, $b, $a\\};",
4751                 []>;
// Emits the two operand forms of a SUST_1D_ARRAY_V4_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4752 multiclass SUST_1D_ARRAY_V4<string inst, NVPTXRegClass intype> {
4753   def _R : SUST_1D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s)>;
4754   def _I : SUST_1D_ARRAY_V4_base<inst, intype, (ins i64imm:$s)>;
// 1D-array four-value store instantiations. sust.b covers b8/b16/b32 for each
// of the .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap
// only. No b64 form is instantiated. b8 and b16 use Int16Regs, b32 uses
// Int32Regs.
4757 defm SUST_B_1D_ARRAY_V4B8_CLAMP
4758   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.clamp", Int16Regs>;
4759 defm SUST_B_1D_ARRAY_V4B16_CLAMP
4760   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.clamp", Int16Regs>;
4761 defm SUST_B_1D_ARRAY_V4B32_CLAMP
4762   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.clamp", Int32Regs>;
4764 defm SUST_B_1D_ARRAY_V4B8_TRAP
4765   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.trap", Int16Regs>;
4766 defm SUST_B_1D_ARRAY_V4B16_TRAP
4767   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.trap", Int16Regs>;
4768 defm SUST_B_1D_ARRAY_V4B32_TRAP
4769   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.trap", Int32Regs>;
4771 defm SUST_B_1D_ARRAY_V4B8_ZERO
4772   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.zero", Int16Regs>;
4773 defm SUST_B_1D_ARRAY_V4B16_ZERO
4774   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.zero", Int16Regs>;
4775 defm SUST_B_1D_ARRAY_V4B32_ZERO
4776   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.zero", Int32Regs>;
4778 defm SUST_P_1D_ARRAY_V4B8_TRAP
4779   : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", Int16Regs>;
4780 defm SUST_P_1D_ARRAY_V4B16_TRAP
4781   : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", Int16Regs>;
4782 defm SUST_P_1D_ARRAY_V4B32_TRAP
4783   : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", Int32Regs>;
// Common shape of a single-value 2D surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored value $r.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs coordinates $x and $y, then $r.
// The pattern list is empty.
4785 class SUST_2D_base<string inst, NVPTXRegClass intype, dag surf>
4786     : NVPTXInst<(outs),
4787                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r)),
4788                 inst # " \t[$s, \\{$x, $y\\}], \\{$r\\};",
4789                 []>;
// Emits the two operand forms of a SUST_2D_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4790 multiclass SUST_2D<string inst, NVPTXRegClass intype> {
4791   def _R : SUST_2D_base<inst, intype, (ins Int64Regs:$s)>;
4792   def _I : SUST_2D_base<inst, intype, (ins i64imm:$s)>;
// 2D store instantiations. sust.b covers b8/b16/b32/b64 for each of the
// .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap only.
// b8 and b16 use Int16Regs, b32 uses Int32Regs, b64 uses Int64Regs.
4795 defm SUST_B_2D_B8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>;
4796 defm SUST_B_2D_B16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>;
4797 defm SUST_B_2D_B32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>;
4798 defm SUST_B_2D_B64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>;
4800 defm SUST_B_2D_B8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>;
4801 defm SUST_B_2D_B16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>;
4802 defm SUST_B_2D_B32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>;
4803 defm SUST_B_2D_B64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>;
4805 defm SUST_B_2D_B8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>;
4806 defm SUST_B_2D_B16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>;
4807 defm SUST_B_2D_B32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>;
4808 defm SUST_B_2D_B64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>;
4810 defm SUST_P_2D_B8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>;
4811 defm SUST_P_2D_B16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>;
4812 defm SUST_P_2D_B32_TRAP : SUST_2D<"sust.p.2d.b32.trap", Int32Regs>;
// Common shape of a two-value 2D surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored values $r and $g.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs coordinates $x and $y, then $r and $g.
// The pattern list is empty.
4814 class SUST_2D_V2_base<string inst, NVPTXRegClass intype, dag surf>
4815     : NVPTXInst<(outs),
4816                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y,
4817                                 intype:$r, intype:$g)),
4818                 inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
4819                 []>;
// Emits the two operand forms of a SUST_2D_V2_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4820 multiclass SUST_2D_V2<string inst, NVPTXRegClass intype> {
4821   def _R : SUST_2D_V2_base<inst, intype, (ins Int64Regs:$s)>;
4822   def _I : SUST_2D_V2_base<inst, intype, (ins i64imm:$s)>;
// 2D two-value store instantiations. sust.b covers b8/b16/b32/b64 for each of
// the .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap
// only. b8 and b16 use Int16Regs, b32 uses Int32Regs, b64 uses Int64Regs.
4825 defm SUST_B_2D_V2B8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>;
4826 defm SUST_B_2D_V2B16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>;
4827 defm SUST_B_2D_V2B32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>;
4828 defm SUST_B_2D_V2B64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>;
4830 defm SUST_B_2D_V2B8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>;
4831 defm SUST_B_2D_V2B16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>;
4832 defm SUST_B_2D_V2B32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>;
4833 defm SUST_B_2D_V2B64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>;
4835 defm SUST_B_2D_V2B8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>;
4836 defm SUST_B_2D_V2B16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", Int16Regs>;
4837 defm SUST_B_2D_V2B32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", Int32Regs>;
4838 defm SUST_B_2D_V2B64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>;
4840 defm SUST_P_2D_V2B8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>;
4841 defm SUST_P_2D_V2B16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>;
4842 defm SUST_P_2D_V2B32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>;
// Common shape of a four-value 2D surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored values $r, $g, $b, $a.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs coordinates $x and $y, then the four
// values. The pattern list is empty.
4844 class SUST_2D_V4_base<string inst, NVPTXRegClass intype, dag surf>
4845     : NVPTXInst<(outs),
4846                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y,
4847                                 intype:$r, intype:$g, intype:$b, intype:$a)),
4848                 inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g, $b, $a\\};",
4849                 []>;
// Emits the two operand forms of a SUST_2D_V4_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4850 multiclass SUST_2D_V4<string inst, NVPTXRegClass intype> {
4851   def _R : SUST_2D_V4_base<inst, intype, (ins Int64Regs:$s)>;
4852   def _I : SUST_2D_V4_base<inst, intype, (ins i64imm:$s)>;
// 2D four-value store instantiations. sust.b covers b8/b16/b32 for each of the
// .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap only.
// No b64 form is instantiated. b8 and b16 use Int16Regs, b32 uses Int32Regs.
4855 defm SUST_B_2D_V4B8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>;
4856 defm SUST_B_2D_V4B16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>;
4857 defm SUST_B_2D_V4B32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>;
4859 defm SUST_B_2D_V4B8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>;
4860 defm SUST_B_2D_V4B16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>;
4861 defm SUST_B_2D_V4B32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>;
4863 defm SUST_B_2D_V4B8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>;
4864 defm SUST_B_2D_V4B16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>;
4865 defm SUST_B_2D_V4B32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>;
4867 defm SUST_P_2D_V4B8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>;
4868 defm SUST_P_2D_V4B16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>;
4869 defm SUST_P_2D_V4B32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>;
// Common shape of a single-value 2D-array surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored value $r.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs operands $idx, $x and $y, then $r.
// The asm string prints a four-element coordinate vector in which $y appears
// twice; presumably the fourth slot is padding required by the PTX syntax -
// TODO confirm against the PTX ISA. The pattern list is empty.
4871 class SUST_2D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf>
4872     : NVPTXInst<(outs),
4873                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
4874                                 intype:$r)),
4875                 inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
4876                 []>;
// Emits the two operand forms of a SUST_2D_ARRAY_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4877 multiclass SUST_2D_ARRAY<string inst, NVPTXRegClass intype> {
4878   def _R : SUST_2D_ARRAY_base<inst, intype, (ins Int64Regs:$s)>;
4879   def _I : SUST_2D_ARRAY_base<inst, intype, (ins i64imm:$s)>;
// 2D-array store instantiations. sust.b covers b8/b16/b32/b64 for each of the
// .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap only.
// b8 and b16 use Int16Regs, b32 uses Int32Regs, b64 uses Int64Regs.
4882 defm SUST_B_2D_ARRAY_B8_CLAMP
4883   : SUST_2D_ARRAY<"sust.b.a2d.b8.clamp", Int16Regs>;
4884 defm SUST_B_2D_ARRAY_B16_CLAMP
4885   : SUST_2D_ARRAY<"sust.b.a2d.b16.clamp", Int16Regs>;
4886 defm SUST_B_2D_ARRAY_B32_CLAMP
4887   : SUST_2D_ARRAY<"sust.b.a2d.b32.clamp", Int32Regs>;
4888 defm SUST_B_2D_ARRAY_B64_CLAMP
4889   : SUST_2D_ARRAY<"sust.b.a2d.b64.clamp", Int64Regs>;
4891 defm SUST_B_2D_ARRAY_B8_TRAP
4892   : SUST_2D_ARRAY<"sust.b.a2d.b8.trap", Int16Regs>;
4893 defm SUST_B_2D_ARRAY_B16_TRAP
4894   : SUST_2D_ARRAY<"sust.b.a2d.b16.trap", Int16Regs>;
4895 defm SUST_B_2D_ARRAY_B32_TRAP
4896   : SUST_2D_ARRAY<"sust.b.a2d.b32.trap", Int32Regs>;
4897 defm SUST_B_2D_ARRAY_B64_TRAP
4898   : SUST_2D_ARRAY<"sust.b.a2d.b64.trap", Int64Regs>;
4900 defm SUST_B_2D_ARRAY_B8_ZERO
4901   : SUST_2D_ARRAY<"sust.b.a2d.b8.zero", Int16Regs>;
4902 defm SUST_B_2D_ARRAY_B16_ZERO
4903   : SUST_2D_ARRAY<"sust.b.a2d.b16.zero", Int16Regs>;
4904 defm SUST_B_2D_ARRAY_B32_ZERO
4905   : SUST_2D_ARRAY<"sust.b.a2d.b32.zero", Int32Regs>;
4906 defm SUST_B_2D_ARRAY_B64_ZERO
4907   : SUST_2D_ARRAY<"sust.b.a2d.b64.zero", Int64Regs>;
4909 defm SUST_P_2D_ARRAY_B8_TRAP
4910   : SUST_2D_ARRAY<"sust.p.a2d.b8.trap", Int16Regs>;
4911 defm SUST_P_2D_ARRAY_B16_TRAP
4912   : SUST_2D_ARRAY<"sust.p.a2d.b16.trap", Int16Regs>;
4913 defm SUST_P_2D_ARRAY_B32_TRAP
4914   : SUST_2D_ARRAY<"sust.p.a2d.b32.trap", Int32Regs>;
// Common shape of a two-value 2D-array surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored values $r and $g.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs operands $idx, $x and $y, then $r, $g.
// The asm string prints a four-element coordinate vector in which $y appears
// twice; presumably the fourth slot is padding required by the PTX syntax -
// TODO confirm against the PTX ISA. The pattern list is empty.
4916 class SUST_2D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf>
4917     : NVPTXInst<(outs),
4918                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
4919                                 intype:$r, intype:$g)),
4920                 inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g\\};",
4921                 []>;
// Emits the two operand forms of a SUST_2D_ARRAY_V2_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4922 multiclass SUST_2D_ARRAY_V2<string inst, NVPTXRegClass intype> {
4923   def _R : SUST_2D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s)>;
4924   def _I : SUST_2D_ARRAY_V2_base<inst, intype, (ins i64imm:$s)>;
// 2D-array two-value store instantiations. sust.b covers b8/b16/b32/b64 for
// each of the .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with
// .trap only. b8 and b16 use Int16Regs, b32 uses Int32Regs, b64 uses Int64Regs.
4927 defm SUST_B_2D_ARRAY_V2B8_CLAMP
4928   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.clamp", Int16Regs>;
4929 defm SUST_B_2D_ARRAY_V2B16_CLAMP
4930   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.clamp", Int16Regs>;
4931 defm SUST_B_2D_ARRAY_V2B32_CLAMP
4932   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.clamp", Int32Regs>;
4933 defm SUST_B_2D_ARRAY_V2B64_CLAMP
4934   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.clamp", Int64Regs>;
4936 defm SUST_B_2D_ARRAY_V2B8_TRAP
4937   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.trap", Int16Regs>;
4938 defm SUST_B_2D_ARRAY_V2B16_TRAP
4939   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.trap", Int16Regs>;
4940 defm SUST_B_2D_ARRAY_V2B32_TRAP
4941   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.trap", Int32Regs>;
4942 defm SUST_B_2D_ARRAY_V2B64_TRAP
4943   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.trap", Int64Regs>;
4945 defm SUST_B_2D_ARRAY_V2B8_ZERO
4946   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.zero", Int16Regs>;
4947 defm SUST_B_2D_ARRAY_V2B16_ZERO
4948   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.zero", Int16Regs>;
4949 defm SUST_B_2D_ARRAY_V2B32_ZERO
4950   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.zero", Int32Regs>;
4951 defm SUST_B_2D_ARRAY_V2B64_ZERO
4952   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.zero", Int64Regs>;
4954 defm SUST_P_2D_ARRAY_V2B8_TRAP
4955   : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b8.trap", Int16Regs>;
4956 defm SUST_P_2D_ARRAY_V2B16_TRAP
4957   : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b16.trap", Int16Regs>;
4958 defm SUST_P_2D_ARRAY_V2B32_TRAP
4959   : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b32.trap", Int32Regs>;
// Common shape of a four-value 2D-array surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored values $r, $g, $b, $a.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs operands $idx, $x and $y, then the
// four values. The asm string prints a four-element coordinate vector in
// which $y appears twice; presumably the fourth slot is padding required by
// the PTX syntax - TODO confirm against the PTX ISA. The pattern list is empty.
4961 class SUST_2D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf>
4962     : NVPTXInst<(outs),
4963                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
4964                                 intype:$r, intype:$g, intype:$b, intype:$a)),
4965                 inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g, $b, $a\\};",
4966                 []>;
// Emits the two operand forms of a SUST_2D_ARRAY_V4_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
4967 multiclass SUST_2D_ARRAY_V4<string inst, NVPTXRegClass intype> {
4968   def _R : SUST_2D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s)>;
4969   def _I : SUST_2D_ARRAY_V4_base<inst, intype, (ins i64imm:$s)>;
// 2D-array four-value store instantiations. sust.b covers b8/b16/b32 for each
// of the .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap
// only. No b64 form is instantiated. b8 and b16 use Int16Regs, b32 uses
// Int32Regs.
4972 defm SUST_B_2D_ARRAY_V4B8_CLAMP
4973   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.clamp", Int16Regs>;
4974 defm SUST_B_2D_ARRAY_V4B16_CLAMP
4975   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.clamp", Int16Regs>;
4976 defm SUST_B_2D_ARRAY_V4B32_CLAMP
4977   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.clamp", Int32Regs>;
4979 defm SUST_B_2D_ARRAY_V4B8_TRAP
4980   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.trap", Int16Regs>;
4981 defm SUST_B_2D_ARRAY_V4B16_TRAP
4982   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.trap", Int16Regs>;
4983 defm SUST_B_2D_ARRAY_V4B32_TRAP
4984   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.trap", Int32Regs>;
4986 defm SUST_B_2D_ARRAY_V4B8_ZERO
4987   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.zero", Int16Regs>;
4988 defm SUST_B_2D_ARRAY_V4B16_ZERO
4989   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.zero", Int16Regs>;
4990 defm SUST_B_2D_ARRAY_V4B32_ZERO
4991   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.zero", Int32Regs>;
4993 defm SUST_P_2D_ARRAY_V4B8_TRAP
4994   : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b8.trap", Int16Regs>;
4995 defm SUST_P_2D_ARRAY_V4B16_TRAP
4996   : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b16.trap", Int16Regs>;
4997 defm SUST_P_2D_ARRAY_V4B32_TRAP
4998   : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b32.trap", Int32Regs>;
// Common shape of a single-value 3D surface store. It has no outputs.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class of the stored value $r.
//   surf   - dag supplying the operand printed as $s.
// The inputs are `surf`, the Int32Regs coordinates $x, $y and $z, then $r.
// The asm string prints a four-element coordinate vector in which $z appears
// twice; presumably the fourth slot is padding required by the PTX syntax -
// TODO confirm against the PTX ISA. The pattern list is empty.
5000 class SUST_3D_base<string inst, NVPTXRegClass intype, dag surf>
5001     : NVPTXInst<(outs),
5002                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5003                                 intype:$r)),
5004                 inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
5005                 []>;
// Emits the two operand forms of a SUST_3D_base instruction:
//   _R - $s is an Int64Regs register.
//   _I - $s is an i64imm immediate.
// NOTE(review): the closing `}` of this multiclass is not visible in this
// excerpt.
5006 multiclass SUST_3D<string inst, NVPTXRegClass intype> {
5007   def _R : SUST_3D_base<inst, intype, (ins Int64Regs:$s)>;
5008   def _I : SUST_3D_base<inst, intype, (ins i64imm:$s)>;
// 3D store instantiations. sust.b covers b8/b16/b32/b64 for each of the
// .clamp, .trap and .zero suffixes; sust.p covers b8/b16/b32 with .trap only.
// b8 and b16 use Int16Regs, b32 uses Int32Regs, b64 uses Int64Regs.
5011 defm SUST_B_3D_B8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", Int16Regs>;
5012 defm SUST_B_3D_B16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>;
5013 defm SUST_B_3D_B32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>;
5014 defm SUST_B_3D_B64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>;
5016 defm SUST_B_3D_B8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>;
5017 defm SUST_B_3D_B16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>;
5018 defm SUST_B_3D_B32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>;
5019 defm SUST_B_3D_B64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>;
5021 defm SUST_B_3D_B8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>;
5022 defm SUST_B_3D_B16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>;
5023 defm SUST_B_3D_B32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>;
5024 defm SUST_B_3D_B64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>;
5026 defm SUST_P_3D_B8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>;
5027 defm SUST_P_3D_B16_TRAP : SUST_3D<"sust.p.3d.b16.trap", Int16Regs>;
5028 defm SUST_P_3D_B32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>;
// Base instruction class for a two-element 3D surface store.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class shared by the two stored values $r and $g.
//   surf   - dag supplying the leading input operand(s); the asm string
//            expects it to define $s.
// The instruction has no outputs. Its inputs are `surf` concatenated (via
// !con) with the Int32Regs coordinates $x, $y, $z and the values $r, $g.
// The pattern list is empty; matching is done by separate Pat records.
// NOTE(review): $z is printed twice, filling the fourth slot of the
// coordinate vector. Presumably PTX requires a four-element coordinate vector
// for 3d geometry and ignores the last element - confirm against the PTX ISA.
5030 class SUST_3D_V2_base<string inst, NVPTXRegClass intype, dag surf>
5031     : NVPTXInst<(outs),
5032                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5033                                 intype:$r, intype:$g)),
5034                 inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g\\};",
5035                 []>;
// Defines the two surface-operand forms of a two-element 3D surface store:
//   _R - the surface handle $s is an Int64Regs register;
//   _I - the surface handle $s is an i64imm immediate.
// Both forward `inst` and `intype` unchanged to SUST_3D_V2_base.
// The closing brace ends the multiclass body so that the defm records that
// follow are parsed as top-level definitions rather than as part of it.
5036 multiclass SUST_3D_V2<string inst, NVPTXRegClass intype> {
5037   def _R : SUST_3D_V2_base<inst, intype, (ins Int64Regs:$s)>;
5038   def _I : SUST_3D_V2_base<inst, intype, (ins i64imm:$s)>;
5039 }
// Two-element 3D surface stores. Each defm expands the SUST_3D_V2 multiclass,
// which defines an _R and an _I record per instantiation.
// sust.b is instantiated for b8/b16/b32/b64 with the .clamp, .trap and .zero
// suffixes. The b8 and b16 variants both use Int16Regs for the stored values.
5041 defm SUST_B_3D_V2B8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>;
5042 defm SUST_B_3D_V2B16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>;
5043 defm SUST_B_3D_V2B32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>;
5044 defm SUST_B_3D_V2B64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>;
5046 defm SUST_B_3D_V2B8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>;
5047 defm SUST_B_3D_V2B16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>;
5048 defm SUST_B_3D_V2B32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>;
5049 defm SUST_B_3D_V2B64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>;
5051 defm SUST_B_3D_V2B8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>;
5052 defm SUST_B_3D_V2B16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>;
5053 defm SUST_B_3D_V2B32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>;
5054 defm SUST_B_3D_V2B64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", Int64Regs>;
// sust.p forms: only b8/b16/b32 with the .trap suffix are instantiated here.
5056 defm SUST_P_3D_V2B8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>;
5057 defm SUST_P_3D_V2B16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>;
5058 defm SUST_P_3D_V2B32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>;
// Base instruction class for a four-element 3D surface store.
//   inst   - mnemonic text placed at the start of the asm string.
//   intype - register class shared by the stored values $r, $g, $b and $a.
//   surf   - dag supplying the leading input operand(s); the asm string
//            expects it to define $s.
// The instruction has no outputs. Its inputs are `surf` concatenated (via
// !con) with the Int32Regs coordinates $x, $y, $z and the four values.
// The pattern list is empty; matching is done by separate Pat records.
// NOTE(review): $z is printed twice, filling the fourth slot of the
// coordinate vector. Presumably PTX requires a four-element coordinate vector
// for 3d geometry and ignores the last element - confirm against the PTX ISA.
5060 class SUST_3D_V4_base<string inst, NVPTXRegClass intype, dag surf>
5061     : NVPTXInst<(outs),
5062                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5063                                 intype:$r, intype:$g, intype:$b, intype:$a)),
5064                 inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g, $b, $a\\};",
5065                 []>;
// Defines the two surface-operand forms of a four-element 3D surface store:
//   _R - the surface handle $s is an Int64Regs register;
//   _I - the surface handle $s is an i64imm immediate.
// Both forward `inst` and `intype` unchanged to SUST_3D_V4_base.
// The closing brace ends the multiclass body so that the defm records that
// follow are parsed as top-level definitions rather than as part of it.
5066 multiclass SUST_3D_V4<string inst, NVPTXRegClass intype> {
5067   def _R : SUST_3D_V4_base<inst, intype, (ins Int64Regs:$s)>;
5068   def _I : SUST_3D_V4_base<inst, intype, (ins i64imm:$s)>;
5069 }
// Four-element 3D surface stores. Each defm expands the SUST_3D_V4
// multiclass, which defines an _R and an _I record per instantiation.
// sust.b is instantiated for b8/b16/b32 with the .clamp, .trap and .zero
// suffixes; unlike the scalar and V2 groups, no b64 variant is instantiated
// here. The b8 and b16 variants both use Int16Regs for the stored values.
5071 defm SUST_B_3D_V4B8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>;
5072 defm SUST_B_3D_V4B16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>;
5073 defm SUST_B_3D_V4B32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>;
5075 defm SUST_B_3D_V4B8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>;
5076 defm SUST_B_3D_V4B16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>;
5077 defm SUST_B_3D_V4B32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>;
5079 defm SUST_B_3D_V4B8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>;
5080 defm SUST_B_3D_V4B16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>;
5081 defm SUST_B_3D_V4B32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>;
// sust.p forms: only the .trap suffix is instantiated here.
5083 defm SUST_P_3D_V4B8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>;
5084 defm SUST_P_3D_V4B16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>;
5085 defm SUST_P_3D_V4B32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>;
5089 // Surface store instruction patterns
5090 // I'm not sure why we can't just include these in the instruction definitions,
5091 // but TableGen complains of type errors :(
5093 // .clamp variant
// Selection patterns for the 1D sust.b ".clamp" intrinsics.
// Each pattern matches an int_nvvm_sust_b_1d_*_clamp intrinsic whose operands
// are the surface handle $s (Int64Regs), the coordinate $x (Int32Regs) and
// one, two or four stored values, and emits the corresponding
// SUST_B_1D_*_CLAMP_R instruction with the operands in the same order.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5094 def : Pat<(int_nvvm_sust_b_1d_i8_clamp
5095            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
5096           (SUST_B_1D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
5098 def : Pat<(int_nvvm_sust_b_1d_i16_clamp
5099            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
5100           (SUST_B_1D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
5102 def : Pat<(int_nvvm_sust_b_1d_i32_clamp
5103            Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
5104           (SUST_B_1D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
5106 def : Pat<(int_nvvm_sust_b_1d_i64_clamp
5107            Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
5108           (SUST_B_1D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5110 def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp
5111            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5112           (SUST_B_1D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x,
5113            Int16Regs:$r, Int16Regs:$g)>;
5115 def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp
5116            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5117           (SUST_B_1D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x,
5118            Int16Regs:$r, Int16Regs:$g)>;
5120 def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp
5121            Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
5122           (SUST_B_1D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x,
5123            Int32Regs:$r, Int32Regs:$g)>;
5125 def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp
5126            Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
5127           (SUST_B_1D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x,
5128            Int64Regs:$r, Int64Regs:$g)>;
// Four-element forms (values $r, $g, $b, $a); no i64 form in this group.
5130 def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp
5131            Int64Regs:$s, Int32Regs:$x,
5132            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5133           (SUST_B_1D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x,
5134            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5136 def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp
5137            Int64Regs:$s, Int32Regs:$x,
5138            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5139           (SUST_B_1D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x,
5140            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5142 def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp
5143            Int64Regs:$s, Int32Regs:$x,
5144            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5145           (SUST_B_1D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x,
5146            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the 1D-array sust.b ".clamp" intrinsics.
// Each pattern matches an int_nvvm_sust_b_1d_array_*_clamp intrinsic whose
// operands are the surface handle $s (Int64Regs), two Int32Regs operands $l
// and $x, and one, two or four stored values, and emits the corresponding
// SUST_B_1D_ARRAY_*_CLAMP_R instruction with the operands in the same order.
// NOTE(review): $l is presumably the array layer index - confirm against the
// 1D-array instruction definitions.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5150 def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp
5151            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
5152           (SUST_B_1D_ARRAY_B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5153            Int16Regs:$r)>;
5155 def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp
5156            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
5157           (SUST_B_1D_ARRAY_B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5158            Int16Regs:$r)>;
5160 def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp
5161            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
5162           (SUST_B_1D_ARRAY_B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5163            Int32Regs:$r)>;
5165 def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp
5166            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
5167           (SUST_B_1D_ARRAY_B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5168            Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5170 def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp
5171           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5172           (SUST_B_1D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5173            Int16Regs:$r, Int16Regs:$g)>;
5175 def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp
5176           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5177           (SUST_B_1D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5178            Int16Regs:$r, Int16Regs:$g)>;
5180 def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp
5181           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
5182           (SUST_B_1D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5183            Int32Regs:$r, Int32Regs:$g)>;
5185 def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp
5186           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
5187           (SUST_B_1D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5188            Int64Regs:$r, Int64Regs:$g)>;
// Four-element forms (values $r, $g, $b, $a); no i64 form in this group.
5190 def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp
5191            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5192            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5193           (SUST_B_1D_ARRAY_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5194            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5196 def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp
5197            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5198            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5199           (SUST_B_1D_ARRAY_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5200            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5202 def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp
5203            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5204            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5205           (SUST_B_1D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5206            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the 2D sust.b ".clamp" intrinsics.
// Each pattern matches an int_nvvm_sust_b_2d_*_clamp intrinsic whose operands
// are the surface handle $s (Int64Regs), the coordinates $x and $y
// (Int32Regs) and one, two or four stored values, and emits the corresponding
// SUST_B_2D_*_CLAMP_R instruction with the operands in the same order.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5210 def : Pat<(int_nvvm_sust_b_2d_i8_clamp
5211            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5212           (SUST_B_2D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5213            Int16Regs:$r)>;
5215 def : Pat<(int_nvvm_sust_b_2d_i16_clamp
5216            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5217           (SUST_B_2D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5218            Int16Regs:$r)>;
5220 def : Pat<(int_nvvm_sust_b_2d_i32_clamp
5221            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
5222           (SUST_B_2D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5223            Int32Regs:$r)>;
5225 def : Pat<(int_nvvm_sust_b_2d_i64_clamp
5226            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
5227           (SUST_B_2D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5228            Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5230 def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp
5231           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
5232           (SUST_B_2D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5233            Int16Regs:$r, Int16Regs:$g)>;
5235 def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp
5236           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
5237           (SUST_B_2D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5238            Int16Regs:$r, Int16Regs:$g)>;
5240 def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp
5241           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
5242           (SUST_B_2D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5243            Int32Regs:$r, Int32Regs:$g)>;
5245 def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp
5246           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
5247           (SUST_B_2D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5248            Int64Regs:$r, Int64Regs:$g)>;
// Four-element forms (values $r, $g, $b, $a); no i64 form in this group.
5250 def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp
5251            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5252            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5253           (SUST_B_2D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5254            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5256 def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp
5257            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5258            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5259           (SUST_B_2D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5260            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5262 def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp
5263            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5264            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5265           (SUST_B_2D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5266            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the 2D-array sust.b ".clamp" intrinsics.
// Each pattern matches an int_nvvm_sust_b_2d_array_*_clamp intrinsic whose
// operands are the surface handle $s (Int64Regs), three Int32Regs operands
// $l, $x and $y, and one, two or four stored values, and emits the
// corresponding SUST_B_2D_ARRAY_*_CLAMP_R instruction with the operands in
// the same order.
// NOTE(review): $l is presumably the array layer index - confirm against the
// 2D-array instruction definitions.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5270 def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp
5271           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5272           (SUST_B_2D_ARRAY_B8_CLAMP_R Int64Regs:$s,
5273            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5274            Int16Regs:$r)>;
5276 def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp
5277           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5278           (SUST_B_2D_ARRAY_B16_CLAMP_R Int64Regs:$s,
5279            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5280            Int16Regs:$r)>;
5282 def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp
5283           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
5284           (SUST_B_2D_ARRAY_B32_CLAMP_R Int64Regs:$s,
5285            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5286            Int32Regs:$r)>;
5288 def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp
5289           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
5290           (SUST_B_2D_ARRAY_B64_CLAMP_R Int64Regs:$s,
5291            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5292            Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5294 def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp
5295            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5296            Int16Regs:$r, Int16Regs:$g),
5297           (SUST_B_2D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l,
5298            Int32Regs:$x, Int32Regs:$y,
5299            Int16Regs:$r, Int16Regs:$g)>;
5301 def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp
5302            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5303            Int16Regs:$r, Int16Regs:$g),
5304           (SUST_B_2D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l,
5305            Int32Regs:$x, Int32Regs:$y,
5306            Int16Regs:$r, Int16Regs:$g)>;
5308 def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp
5309            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
5310            Int32Regs:$g),
5311           (SUST_B_2D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l,
5312            Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
5314 def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp
5315            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
5316            Int64Regs:$g),
5317           (SUST_B_2D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l,
5318            Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
// Four-element forms (values $r, $g, $b, $a); no i64 form in this group.
5320 def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp
5321            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5322            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5323           (SUST_B_2D_ARRAY_V4B8_CLAMP_R Int64Regs:$s,
5324            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5325            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5327 def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp
5328            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5329            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5330           (SUST_B_2D_ARRAY_V4B16_CLAMP_R Int64Regs:$s,
5331            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5332            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5334 def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
5335            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5336            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5337           (SUST_B_2D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l,
5338            Int32Regs:$x, Int32Regs:$y,
5339            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the 3D sust.b ".clamp" intrinsics.
// Each pattern matches an int_nvvm_sust_b_3d_*_clamp intrinsic whose operands
// are the surface handle $s (Int64Regs), the coordinates $x, $y and $z
// (Int32Regs) and one, two or four stored values, and emits the corresponding
// SUST_B_3D_*_CLAMP_R instruction with the operands in the same order.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5343 def : Pat<(int_nvvm_sust_b_3d_i8_clamp
5344            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5345            Int16Regs:$r),
5346           (SUST_B_3D_B8_CLAMP_R Int64Regs:$s,
5347            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5348            Int16Regs:$r)>;
5350 def : Pat<(int_nvvm_sust_b_3d_i16_clamp
5351            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5352            Int16Regs:$r),
5353           (SUST_B_3D_B16_CLAMP_R Int64Regs:$s,
5354            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5355            Int16Regs:$r)>;
5357 def : Pat<(int_nvvm_sust_b_3d_i32_clamp
5358            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5359            Int32Regs:$r),
5360           (SUST_B_3D_B32_CLAMP_R Int64Regs:$s,
5361            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5362            Int32Regs:$r)>;
5364 def : Pat<(int_nvvm_sust_b_3d_i64_clamp
5365            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5366            Int64Regs:$r),
5367           (SUST_B_3D_B64_CLAMP_R Int64Regs:$s,
5368            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5369            Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5371 def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp
5372            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5373            Int16Regs:$r, Int16Regs:$g),
5374           (SUST_B_3D_V2B8_CLAMP_R Int64Regs:$s,
5375            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5376            Int16Regs:$r, Int16Regs:$g)>;
5378 def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp
5379            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5380            Int16Regs:$r, Int16Regs:$g),
5381           (SUST_B_3D_V2B16_CLAMP_R Int64Regs:$s,
5382            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5383            Int16Regs:$r, Int16Regs:$g)>;
5385 def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp
5386            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5387            Int32Regs:$r, Int32Regs:$g),
5388           (SUST_B_3D_V2B32_CLAMP_R Int64Regs:$s,
5389            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5390            Int32Regs:$r, Int32Regs:$g)>;
5392 def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp
5393            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5394            Int64Regs:$r, Int64Regs:$g),
5395           (SUST_B_3D_V2B64_CLAMP_R Int64Regs:$s,
5396            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5397            Int64Regs:$r, Int64Regs:$g)>;
// Four-element forms (values $r, $g, $b, $a); no i64 form in this group.
5399 def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp
5400            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5401            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5402           (SUST_B_3D_V4B8_CLAMP_R Int64Regs:$s,
5403            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5404            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5406 def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp
5407            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5408            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5409           (SUST_B_3D_V4B16_CLAMP_R Int64Regs:$s,
5410            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5411            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5413 def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
5414            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5415            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5416           (SUST_B_3D_V4B32_CLAMP_R Int64Regs:$s,
5417            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5418            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
5421 // .trap variant
// Selection patterns for the 1D sust.b ".trap" intrinsics.
// Each pattern matches an int_nvvm_sust_b_1d_*_trap intrinsic whose operands
// are the surface handle $s (Int64Regs), the coordinate $x (Int32Regs) and
// one, two or four stored values, and emits the corresponding
// SUST_B_1D_*_TRAP_R instruction with the operands in the same order.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5422 def : Pat<(int_nvvm_sust_b_1d_i8_trap
5423            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
5424           (SUST_B_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
5426 def : Pat<(int_nvvm_sust_b_1d_i16_trap
5427            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
5428           (SUST_B_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
5430 def : Pat<(int_nvvm_sust_b_1d_i32_trap
5431            Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
5432           (SUST_B_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
5434 def : Pat<(int_nvvm_sust_b_1d_i64_trap
5435            Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
5436           (SUST_B_1D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5438 def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
5439            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5440           (SUST_B_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
5441            Int16Regs:$r, Int16Regs:$g)>;
5443 def : Pat<(int_nvvm_sust_b_1d_v2i16_trap
5444            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5445           (SUST_B_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
5446            Int16Regs:$r, Int16Regs:$g)>;
5448 def : Pat<(int_nvvm_sust_b_1d_v2i32_trap
5449            Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
5450           (SUST_B_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
5451            Int32Regs:$r, Int32Regs:$g)>;
5453 def : Pat<(int_nvvm_sust_b_1d_v2i64_trap
5454            Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
5455           (SUST_B_1D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x,
5456            Int64Regs:$r, Int64Regs:$g)>;
// Four-element forms (values $r, $g, $b, $a); no i64 form in this group.
5458 def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
5459            Int64Regs:$s, Int32Regs:$x,
5460            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5461           (SUST_B_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
5462            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5464 def : Pat<(int_nvvm_sust_b_1d_v4i16_trap
5465            Int64Regs:$s, Int32Regs:$x,
5466            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5467           (SUST_B_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
5468            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5470 def : Pat<(int_nvvm_sust_b_1d_v4i32_trap
5471            Int64Regs:$s, Int32Regs:$x,
5472            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5473           (SUST_B_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
5474            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the 1D-array sust.b ".trap" intrinsics.
// Each pattern matches an int_nvvm_sust_b_1d_array_*_trap intrinsic whose
// operands are the surface handle $s (Int64Regs), two Int32Regs operands $l
// and $x, and one, two or four stored values, and emits the corresponding
// SUST_B_1D_ARRAY_*_TRAP_R instruction with the operands in the same order.
// NOTE(review): $l is presumably the array layer index - confirm against the
// 1D-array instruction definitions.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5478 def : Pat<(int_nvvm_sust_b_1d_array_i8_trap
5479            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
5480           (SUST_B_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5481            Int16Regs:$r)>;
5483 def : Pat<(int_nvvm_sust_b_1d_array_i16_trap
5484            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
5485           (SUST_B_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5486            Int16Regs:$r)>;
5488 def : Pat<(int_nvvm_sust_b_1d_array_i32_trap
5489            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
5490           (SUST_B_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5491            Int32Regs:$r)>;
5493 def : Pat<(int_nvvm_sust_b_1d_array_i64_trap
5494            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
5495           (SUST_B_1D_ARRAY_B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5496            Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5498 def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
5499           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5500           (SUST_B_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5501            Int16Regs:$r, Int16Regs:$g)>;
5503 def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap
5504           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5505           (SUST_B_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5506            Int16Regs:$r, Int16Regs:$g)>;
5508 def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap
5509           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
5510           (SUST_B_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5511            Int32Regs:$r, Int32Regs:$g)>;
5513 def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap
5514           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
5515           (SUST_B_1D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5516            Int64Regs:$r, Int64Regs:$g)>;
// Four-element forms (values $r, $g, $b, $a); no i64 form in this group.
5518 def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
5519            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5520            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5521           (SUST_B_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5522            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5524 def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap
5525            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5526            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5527           (SUST_B_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5528            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5530 def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap
5531            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5532            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5533           (SUST_B_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5534            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the 2D sust.b ".trap" intrinsics.
// Each pattern matches an int_nvvm_sust_b_2d_*_trap intrinsic whose operands
// are the surface handle $s (Int64Regs), the coordinates $x and $y
// (Int32Regs) and one, two or four stored values, and emits the corresponding
// SUST_B_2D_*_TRAP_R instruction with the operands in the same order.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5538 def : Pat<(int_nvvm_sust_b_2d_i8_trap
5539            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5540           (SUST_B_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5541            Int16Regs:$r)>;
5543 def : Pat<(int_nvvm_sust_b_2d_i16_trap
5544            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5545           (SUST_B_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5546            Int16Regs:$r)>;
5548 def : Pat<(int_nvvm_sust_b_2d_i32_trap
5549            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
5550           (SUST_B_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5551            Int32Regs:$r)>;
5553 def : Pat<(int_nvvm_sust_b_2d_i64_trap
5554            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
5555           (SUST_B_2D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5556            Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5558 def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
5559           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
5560           (SUST_B_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5561            Int16Regs:$r, Int16Regs:$g)>;
5563 def : Pat<(int_nvvm_sust_b_2d_v2i16_trap
5564           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
5565           (SUST_B_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5566            Int16Regs:$r, Int16Regs:$g)>;
5568 def : Pat<(int_nvvm_sust_b_2d_v2i32_trap
5569           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
5570           (SUST_B_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5571            Int32Regs:$r, Int32Regs:$g)>;
5573 def : Pat<(int_nvvm_sust_b_2d_v2i64_trap
5574           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
5575           (SUST_B_2D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5576            Int64Regs:$r, Int64Regs:$g)>;
// Four-element forms (values $r, $g, $b, $a); no i64 form in this group.
5578 def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
5579            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5580            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5581           (SUST_B_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5582            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5584 def : Pat<(int_nvvm_sust_b_2d_v4i16_trap
5585            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5586            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5587           (SUST_B_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5588            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5590 def : Pat<(int_nvvm_sust_b_2d_v4i32_trap
5591            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5592            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5593           (SUST_B_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5594            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the 2D-array sust.b ".trap" intrinsics.
// Each pattern matches an int_nvvm_sust_b_2d_array_*_trap intrinsic whose
// operands are the surface handle $s (Int64Regs), three Int32Regs operands
// $l, $x and $y, and one, two or four stored values, and emits the
// corresponding SUST_B_2D_ARRAY_*_TRAP_R instruction with the operands in
// the same order.
// NOTE(review): $l is presumably the array layer index - confirm against the
// 2D-array instruction definitions.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5598 def : Pat<(int_nvvm_sust_b_2d_array_i8_trap
5599           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5600           (SUST_B_2D_ARRAY_B8_TRAP_R Int64Regs:$s,
5601            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5602            Int16Regs:$r)>;
5604 def : Pat<(int_nvvm_sust_b_2d_array_i16_trap
5605           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5606           (SUST_B_2D_ARRAY_B16_TRAP_R Int64Regs:$s,
5607            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5608            Int16Regs:$r)>;
5610 def : Pat<(int_nvvm_sust_b_2d_array_i32_trap
5611           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
5612           (SUST_B_2D_ARRAY_B32_TRAP_R Int64Regs:$s,
5613            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5614            Int32Regs:$r)>;
5616 def : Pat<(int_nvvm_sust_b_2d_array_i64_trap
5617           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
5618           (SUST_B_2D_ARRAY_B64_TRAP_R Int64Regs:$s,
5619            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5620            Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5622 def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
5623            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5624            Int16Regs:$r, Int16Regs:$g),
5625           (SUST_B_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l,
5626            Int32Regs:$x, Int32Regs:$y,
5627            Int16Regs:$r, Int16Regs:$g)>;
5629 def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap
5630            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5631            Int16Regs:$r, Int16Regs:$g),
5632           (SUST_B_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l,
5633            Int32Regs:$x, Int32Regs:$y,
5634            Int16Regs:$r, Int16Regs:$g)>;
5636 def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap
5637            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
5638            Int32Regs:$g),
5639           (SUST_B_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
5640            Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
5642 def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap
5643            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
5644            Int64Regs:$g),
5645           (SUST_B_2D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l,
5646            Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
// Four-element forms (values $r, $g, $b, $a); no i64 form in this group.
5648 def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
5649            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5650            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5651           (SUST_B_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s,
5652            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5653            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5655 def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap
5656            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5657            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5658           (SUST_B_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s,
5659            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5660            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5662 def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
5663            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5664            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5665           (SUST_B_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
5666            Int32Regs:$x, Int32Regs:$y,
5667            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the 3D sust.b ".trap" intrinsics (scalar forms and
// the v2i8/v2i16/v2i32 forms).
// Each pattern matches an int_nvvm_sust_b_3d_*_trap intrinsic whose operands
// are the surface handle $s (Int64Regs), the coordinates $x, $y and $z
// (Int32Regs) and one or two stored values, and emits the corresponding
// SUST_B_3D_*_TRAP_R instruction with the operands in the same order.
// The i8 and i16 forms both take their values in Int16Regs. Only the _R
// (register surface handle) instruction form is selected by these patterns.
//
// Scalar forms (single value $r).
5671 def : Pat<(int_nvvm_sust_b_3d_i8_trap
5672            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5673            Int16Regs:$r),
5674           (SUST_B_3D_B8_TRAP_R Int64Regs:$s,
5675            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5676            Int16Regs:$r)>;
5678 def : Pat<(int_nvvm_sust_b_3d_i16_trap
5679            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5680            Int16Regs:$r),
5681           (SUST_B_3D_B16_TRAP_R Int64Regs:$s,
5682            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5683            Int16Regs:$r)>;
5685 def : Pat<(int_nvvm_sust_b_3d_i32_trap
5686            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5687            Int32Regs:$r),
5688           (SUST_B_3D_B32_TRAP_R Int64Regs:$s,
5689            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5690            Int32Regs:$r)>;
5692 def : Pat<(int_nvvm_sust_b_3d_i64_trap
5693            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5694            Int64Regs:$r),
5695           (SUST_B_3D_B64_TRAP_R Int64Regs:$s,
5696            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5697            Int64Regs:$r)>;
// Two-element forms (values $r, $g).
5699 def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
5700            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5701            Int16Regs:$r, Int16Regs:$g),
5702           (SUST_B_3D_V2B8_TRAP_R Int64Regs:$s,
5703            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5704            Int16Regs:$r, Int16Regs:$g)>;
5706 def : Pat<(int_nvvm_sust_b_3d_v2i16_trap
5707            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5708            Int16Regs:$r, Int16Regs:$g),
5709           (SUST_B_3D_V2B16_TRAP_R Int64Regs:$s,
5710            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5711            Int16Regs:$r, Int16Regs:$g)>;
5713 def : Pat<(int_nvvm_sust_b_3d_v2i32_trap
5714            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5715            Int32Regs:$r, Int32Regs:$g),
5716           (SUST_B_3D_V2B32_TRAP_R Int64Regs:$s,
5717            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5718            Int32Regs:$r, Int32Regs:$g)>;
5720 def : Pat<(int_nvvm_sust_b_3d_v2i64_trap
5721            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5722            Int64Regs:$r, Int64Regs:$g),
5723           (SUST_B_3D_V2B64_TRAP_R Int64Regs:$s,
5724            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5725            Int64Regs:$r, Int64Regs:$g)>;
5727 def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
5728            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5729            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5730           (SUST_B_3D_V4B8_TRAP_R Int64Regs:$s,
5731            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5732            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5734 def : Pat<(int_nvvm_sust_b_3d_v4i16_trap
5735            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5736            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5737           (SUST_B_3D_V4B16_TRAP_R Int64Regs:$s,
5738            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5739            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5741 def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
5742            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5743            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5744           (SUST_B_3D_V4B32_TRAP_R Int64Regs:$s,
5745            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
5746            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
5749 // .zero variant
// Selection patterns for the int_nvvm_sust_b_1d_*_zero intrinsics. Each one is
// rewritten to the SUST_B_1D_*_ZERO_R instruction of the same element layout,
// forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $x          - Int32Regs operand (presumably the coordinate)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms, Int64Regs for the 64-bit forms.
5750 def : Pat<(int_nvvm_sust_b_1d_i8_zero
5751            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
5752           (SUST_B_1D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
5754 def : Pat<(int_nvvm_sust_b_1d_i16_zero
5755            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
5756           (SUST_B_1D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
5758 def : Pat<(int_nvvm_sust_b_1d_i32_zero
5759            Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
5760           (SUST_B_1D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
5762 def : Pat<(int_nvvm_sust_b_1d_i64_zero
5763            Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
5764           (SUST_B_1D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
5766 def : Pat<(int_nvvm_sust_b_1d_v2i8_zero
5767            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5768           (SUST_B_1D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x,
5769            Int16Regs:$r, Int16Regs:$g)>;
5771 def : Pat<(int_nvvm_sust_b_1d_v2i16_zero
5772            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5773           (SUST_B_1D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x,
5774            Int16Regs:$r, Int16Regs:$g)>;
5776 def : Pat<(int_nvvm_sust_b_1d_v2i32_zero
5777            Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
5778           (SUST_B_1D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x,
5779            Int32Regs:$r, Int32Regs:$g)>;
5781 def : Pat<(int_nvvm_sust_b_1d_v2i64_zero
5782            Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
5783           (SUST_B_1D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x,
5784            Int64Regs:$r, Int64Regs:$g)>;
5786 def : Pat<(int_nvvm_sust_b_1d_v4i8_zero
5787            Int64Regs:$s, Int32Regs:$x,
5788            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5789           (SUST_B_1D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x,
5790            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5792 def : Pat<(int_nvvm_sust_b_1d_v4i16_zero
5793            Int64Regs:$s, Int32Regs:$x,
5794            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5795           (SUST_B_1D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x,
5796            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5798 def : Pat<(int_nvvm_sust_b_1d_v4i32_zero
5799            Int64Regs:$s, Int32Regs:$x,
5800            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5801           (SUST_B_1D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x,
5802            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the int_nvvm_sust_b_1d_array_*_zero intrinsics. Each
// one is rewritten to the SUST_B_1D_ARRAY_*_ZERO_R instruction of the same
// element layout, forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $l, $x      - Int32Regs operands (presumably array layer and coordinate)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms, Int64Regs for the 64-bit forms.
5806 def : Pat<(int_nvvm_sust_b_1d_array_i8_zero
5807            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
5808           (SUST_B_1D_ARRAY_B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5809            Int16Regs:$r)>;
5811 def : Pat<(int_nvvm_sust_b_1d_array_i16_zero
5812            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
5813           (SUST_B_1D_ARRAY_B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5814            Int16Regs:$r)>;
5816 def : Pat<(int_nvvm_sust_b_1d_array_i32_zero
5817            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
5818           (SUST_B_1D_ARRAY_B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5819            Int32Regs:$r)>;
5821 def : Pat<(int_nvvm_sust_b_1d_array_i64_zero
5822            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
5823           (SUST_B_1D_ARRAY_B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5824            Int64Regs:$r)>;
5826 def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero
5827           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5828           (SUST_B_1D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5829            Int16Regs:$r, Int16Regs:$g)>;
5831 def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero
5832           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
5833           (SUST_B_1D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5834            Int16Regs:$r, Int16Regs:$g)>;
5836 def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero
5837           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
5838           (SUST_B_1D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5839            Int32Regs:$r, Int32Regs:$g)>;
5841 def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero
5842           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
5843           (SUST_B_1D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5844            Int64Regs:$r, Int64Regs:$g)>;
5846 def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero
5847            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5848            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5849           (SUST_B_1D_ARRAY_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5850            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5852 def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero
5853            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5854            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5855           (SUST_B_1D_ARRAY_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5856            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5858 def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero
5859            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5860            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5861           (SUST_B_1D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
5862            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the int_nvvm_sust_b_2d_*_zero intrinsics. Each one is
// rewritten to the SUST_B_2D_*_ZERO_R instruction of the same element layout,
// forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $x, $y      - Int32Regs operands (presumably the coordinates)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms, Int64Regs for the 64-bit forms.
5866 def : Pat<(int_nvvm_sust_b_2d_i8_zero
5867            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5868           (SUST_B_2D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5869            Int16Regs:$r)>;
5871 def : Pat<(int_nvvm_sust_b_2d_i16_zero
5872            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5873           (SUST_B_2D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5874            Int16Regs:$r)>;
5876 def : Pat<(int_nvvm_sust_b_2d_i32_zero
5877            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
5878           (SUST_B_2D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5879            Int32Regs:$r)>;
5881 def : Pat<(int_nvvm_sust_b_2d_i64_zero
5882            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
5883           (SUST_B_2D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5884            Int64Regs:$r)>;
5886 def : Pat<(int_nvvm_sust_b_2d_v2i8_zero
5887           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
5888           (SUST_B_2D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5889            Int16Regs:$r, Int16Regs:$g)>;
5891 def : Pat<(int_nvvm_sust_b_2d_v2i16_zero
5892           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
5893           (SUST_B_2D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5894            Int16Regs:$r, Int16Regs:$g)>;
5896 def : Pat<(int_nvvm_sust_b_2d_v2i32_zero
5897           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
5898           (SUST_B_2D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5899            Int32Regs:$r, Int32Regs:$g)>;
5901 def : Pat<(int_nvvm_sust_b_2d_v2i64_zero
5902           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
5903           (SUST_B_2D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5904            Int64Regs:$r, Int64Regs:$g)>;
5906 def : Pat<(int_nvvm_sust_b_2d_v4i8_zero
5907            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5908            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5909           (SUST_B_2D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5910            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5912 def : Pat<(int_nvvm_sust_b_2d_v4i16_zero
5913            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5914            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5915           (SUST_B_2D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5916            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5918 def : Pat<(int_nvvm_sust_b_2d_v4i32_zero
5919            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5920            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5921           (SUST_B_2D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
5922            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the int_nvvm_sust_b_2d_array_*_zero intrinsics. Each
// one is rewritten to the SUST_B_2D_ARRAY_*_ZERO_R instruction of the same
// element layout, forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $l, $x, $y  - Int32Regs operands (presumably array layer and coordinates)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms, Int64Regs for the 64-bit forms.
5926 def : Pat<(int_nvvm_sust_b_2d_array_i8_zero
5927           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5928           (SUST_B_2D_ARRAY_B8_ZERO_R Int64Regs:$s,
5929            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5930            Int16Regs:$r)>;
5932 def : Pat<(int_nvvm_sust_b_2d_array_i16_zero
5933           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
5934           (SUST_B_2D_ARRAY_B16_ZERO_R Int64Regs:$s,
5935            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5936            Int16Regs:$r)>;
5938 def : Pat<(int_nvvm_sust_b_2d_array_i32_zero
5939           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
5940           (SUST_B_2D_ARRAY_B32_ZERO_R Int64Regs:$s,
5941            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5942            Int32Regs:$r)>;
5944 def : Pat<(int_nvvm_sust_b_2d_array_i64_zero
5945           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
5946           (SUST_B_2D_ARRAY_B64_ZERO_R Int64Regs:$s,
5947            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5948            Int64Regs:$r)>;
5950 def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero
5951            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5952            Int16Regs:$r, Int16Regs:$g),
5953           (SUST_B_2D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l,
5954            Int32Regs:$x, Int32Regs:$y,
5955            Int16Regs:$r, Int16Regs:$g)>;
5957 def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero
5958            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5959            Int16Regs:$r, Int16Regs:$g),
5960           (SUST_B_2D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l,
5961            Int32Regs:$x, Int32Regs:$y,
5962            Int16Regs:$r, Int16Regs:$g)>;
5964 def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero
5965            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
5966            Int32Regs:$g),
5967           (SUST_B_2D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l,
5968            Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
5970 def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero
5971            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
5972            Int64Regs:$g),
5973           (SUST_B_2D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l,
5974            Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
5976 def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero
5977            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5978            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5979           (SUST_B_2D_ARRAY_V4B8_ZERO_R Int64Regs:$s,
5980            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5981            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5983 def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero
5984            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5985            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
5986           (SUST_B_2D_ARRAY_V4B16_ZERO_R Int64Regs:$s,
5987            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5988            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
5990 def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
5991            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
5992            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
5993           (SUST_B_2D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l,
5994            Int32Regs:$x, Int32Regs:$y,
5995            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the int_nvvm_sust_b_3d_*_zero intrinsics. Each one is
// rewritten to the SUST_B_3D_*_ZERO_R instruction of the same element layout,
// forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $x, $y, $z  - Int32Regs operands (presumably the coordinates)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms, Int64Regs for the 64-bit forms.
5999 def : Pat<(int_nvvm_sust_b_3d_i8_zero
6000            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6001            Int16Regs:$r),
6002           (SUST_B_3D_B8_ZERO_R Int64Regs:$s,
6003            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6004            Int16Regs:$r)>;
6006 def : Pat<(int_nvvm_sust_b_3d_i16_zero
6007            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6008            Int16Regs:$r),
6009           (SUST_B_3D_B16_ZERO_R Int64Regs:$s,
6010            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6011            Int16Regs:$r)>;
6013 def : Pat<(int_nvvm_sust_b_3d_i32_zero
6014            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6015            Int32Regs:$r),
6016           (SUST_B_3D_B32_ZERO_R Int64Regs:$s,
6017            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6018            Int32Regs:$r)>;
6020 def : Pat<(int_nvvm_sust_b_3d_i64_zero
6021            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6022            Int64Regs:$r),
6023           (SUST_B_3D_B64_ZERO_R Int64Regs:$s,
6024            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6025            Int64Regs:$r)>;
6027 def : Pat<(int_nvvm_sust_b_3d_v2i8_zero
6028            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6029            Int16Regs:$r, Int16Regs:$g),
6030           (SUST_B_3D_V2B8_ZERO_R Int64Regs:$s,
6031            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6032            Int16Regs:$r, Int16Regs:$g)>;
6034 def : Pat<(int_nvvm_sust_b_3d_v2i16_zero
6035            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6036            Int16Regs:$r, Int16Regs:$g),
6037           (SUST_B_3D_V2B16_ZERO_R Int64Regs:$s,
6038            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6039            Int16Regs:$r, Int16Regs:$g)>;
6041 def : Pat<(int_nvvm_sust_b_3d_v2i32_zero
6042            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6043            Int32Regs:$r, Int32Regs:$g),
6044           (SUST_B_3D_V2B32_ZERO_R Int64Regs:$s,
6045            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6046            Int32Regs:$r, Int32Regs:$g)>;
6048 def : Pat<(int_nvvm_sust_b_3d_v2i64_zero
6049            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6050            Int64Regs:$r, Int64Regs:$g),
6051           (SUST_B_3D_V2B64_ZERO_R Int64Regs:$s,
6052            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6053            Int64Regs:$r, Int64Regs:$g)>;
6055 def : Pat<(int_nvvm_sust_b_3d_v4i8_zero
6056            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6057            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6058           (SUST_B_3D_V4B8_ZERO_R Int64Regs:$s,
6059            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6060            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6062 def : Pat<(int_nvvm_sust_b_3d_v4i16_zero
6063            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6064            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6065           (SUST_B_3D_V4B16_ZERO_R Int64Regs:$s,
6066            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6067            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6069 def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
6070            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6071            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
6072           (SUST_B_3D_V4B32_ZERO_R Int64Regs:$s,
6073            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6074            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the int_nvvm_sust_p_1d_*_trap intrinsics. Each one is
// rewritten to the SUST_P_1D_*_TRAP_R instruction of the same element layout,
// forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $x          - Int32Regs operand (presumably the coordinate)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms.
// Only 8-, 16- and 32-bit element forms appear in this group.
6079 def : Pat<(int_nvvm_sust_p_1d_i8_trap
6080            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
6081           (SUST_P_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
6083 def : Pat<(int_nvvm_sust_p_1d_i16_trap
6084            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
6085           (SUST_P_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
6087 def : Pat<(int_nvvm_sust_p_1d_i32_trap
6088            Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
6089           (SUST_P_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
6091 def : Pat<(int_nvvm_sust_p_1d_v2i8_trap
6092            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
6093           (SUST_P_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
6094            Int16Regs:$r, Int16Regs:$g)>;
6096 def : Pat<(int_nvvm_sust_p_1d_v2i16_trap
6097            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
6098           (SUST_P_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
6099            Int16Regs:$r, Int16Regs:$g)>;
6101 def : Pat<(int_nvvm_sust_p_1d_v2i32_trap
6102            Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
6103           (SUST_P_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
6104            Int32Regs:$r, Int32Regs:$g)>;
6106 def : Pat<(int_nvvm_sust_p_1d_v4i8_trap
6107            Int64Regs:$s, Int32Regs:$x,
6108            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6109           (SUST_P_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
6110            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6112 def : Pat<(int_nvvm_sust_p_1d_v4i16_trap
6113            Int64Regs:$s, Int32Regs:$x,
6114            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6115           (SUST_P_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
6116            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6118 def : Pat<(int_nvvm_sust_p_1d_v4i32_trap
6119            Int64Regs:$s, Int32Regs:$x,
6120            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
6121           (SUST_P_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
6122            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the int_nvvm_sust_p_1d_array_*_trap intrinsics. Each
// one is rewritten to the SUST_P_1D_ARRAY_*_TRAP_R instruction of the same
// element layout, forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $l, $x      - Int32Regs operands (presumably array layer and coordinate)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms.
// Only 8-, 16- and 32-bit element forms appear in this group.
6126 def : Pat<(int_nvvm_sust_p_1d_array_i8_trap
6127            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
6128           (SUST_P_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6129            Int16Regs:$r)>;
6131 def : Pat<(int_nvvm_sust_p_1d_array_i16_trap
6132            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
6133           (SUST_P_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6134            Int16Regs:$r)>;
6136 def : Pat<(int_nvvm_sust_p_1d_array_i32_trap
6137            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
6138           (SUST_P_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6139            Int32Regs:$r)>;
6141 def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap
6142           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
6143           (SUST_P_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6144            Int16Regs:$r, Int16Regs:$g)>;
6146 def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap
6147           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
6148           (SUST_P_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6149            Int16Regs:$r, Int16Regs:$g)>;
6151 def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap
6152           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
6153           (SUST_P_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6154            Int32Regs:$r, Int32Regs:$g)>;
6156 def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap
6157            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6158            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6159           (SUST_P_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6160            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6162 def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap
6163            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6164            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6165           (SUST_P_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6166            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6168 def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap
6169            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6170            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
6171           (SUST_P_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
6172            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the int_nvvm_sust_p_2d_*_trap intrinsics. Each one is
// rewritten to the SUST_P_2D_*_TRAP_R instruction of the same element layout,
// forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $x, $y      - Int32Regs operands (presumably the coordinates)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms.
// Only 8-, 16- and 32-bit element forms appear in this group.
6176 def : Pat<(int_nvvm_sust_p_2d_i8_trap
6177            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
6178           (SUST_P_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6179            Int16Regs:$r)>;
6181 def : Pat<(int_nvvm_sust_p_2d_i16_trap
6182            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
6183           (SUST_P_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6184            Int16Regs:$r)>;
6186 def : Pat<(int_nvvm_sust_p_2d_i32_trap
6187            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
6188           (SUST_P_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6189            Int32Regs:$r)>;
6191 def : Pat<(int_nvvm_sust_p_2d_v2i8_trap
6192           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
6193           (SUST_P_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6194            Int16Regs:$r, Int16Regs:$g)>;
6196 def : Pat<(int_nvvm_sust_p_2d_v2i16_trap
6197           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
6198           (SUST_P_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6199            Int16Regs:$r, Int16Regs:$g)>;
6201 def : Pat<(int_nvvm_sust_p_2d_v2i32_trap
6202           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
6203           (SUST_P_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6204            Int32Regs:$r, Int32Regs:$g)>;
6206 def : Pat<(int_nvvm_sust_p_2d_v4i8_trap
6207            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6208            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6209           (SUST_P_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6210            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6212 def : Pat<(int_nvvm_sust_p_2d_v4i16_trap
6213            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6214            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6215           (SUST_P_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6216            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6218 def : Pat<(int_nvvm_sust_p_2d_v4i32_trap
6219            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6220            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
6221           (SUST_P_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
6222            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the int_nvvm_sust_p_2d_array_*_trap intrinsics. Each
// one is rewritten to the SUST_P_2D_ARRAY_*_TRAP_R instruction of the same
// element layout, forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $l, $x, $y  - Int32Regs operands (presumably array layer and coordinates)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms.
// Only 8-, 16- and 32-bit element forms appear in this group.
6226 def : Pat<(int_nvvm_sust_p_2d_array_i8_trap
6227           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
6228           (SUST_P_2D_ARRAY_B8_TRAP_R Int64Regs:$s,
6229            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6230            Int16Regs:$r)>;
6232 def : Pat<(int_nvvm_sust_p_2d_array_i16_trap
6233           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
6234           (SUST_P_2D_ARRAY_B16_TRAP_R Int64Regs:$s,
6235            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6236            Int16Regs:$r)>;
6238 def : Pat<(int_nvvm_sust_p_2d_array_i32_trap
6239           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
6240           (SUST_P_2D_ARRAY_B32_TRAP_R Int64Regs:$s,
6241            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6242            Int32Regs:$r)>;
6244 def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap
6245            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6246            Int16Regs:$r, Int16Regs:$g),
6247           (SUST_P_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l,
6248            Int32Regs:$x, Int32Regs:$y,
6249            Int16Regs:$r, Int16Regs:$g)>;
6251 def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap
6252            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6253            Int16Regs:$r, Int16Regs:$g),
6254           (SUST_P_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l,
6255            Int32Regs:$x, Int32Regs:$y,
6256            Int16Regs:$r, Int16Regs:$g)>;
6258 def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap
6259            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
6260            Int32Regs:$g),
6261           (SUST_P_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
6262            Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
6264 def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap
6265            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6266            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6267           (SUST_P_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s,
6268            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6269            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6271 def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap
6272            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6273            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6274           (SUST_P_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s,
6275            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6276            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6278 def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
6279            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
6280            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
6281           (SUST_P_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
6282            Int32Regs:$x, Int32Regs:$y,
6283            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
// Selection patterns for the int_nvvm_sust_p_3d_*_trap intrinsics. Each one is
// rewritten to the SUST_P_3D_*_TRAP_R instruction of the same element layout,
// forwarding every operand unchanged and in the same order:
//   $s          - Int64Regs operand (presumably the surface handle; confirm
//                 against the instruction definitions)
//   $x, $y, $z  - Int32Regs operands (presumably the coordinates)
//   $r/$g/$b/$a - data components; Int16Regs for both the 8- and 16-bit forms,
//                 Int32Regs for the 32-bit forms.
// Only 8-, 16- and 32-bit element forms appear in this group.
6287 def : Pat<(int_nvvm_sust_p_3d_i8_trap
6288            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6289            Int16Regs:$r),
6290           (SUST_P_3D_B8_TRAP_R Int64Regs:$s,
6291            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6292            Int16Regs:$r)>;
6294 def : Pat<(int_nvvm_sust_p_3d_i16_trap
6295            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6296            Int16Regs:$r),
6297           (SUST_P_3D_B16_TRAP_R Int64Regs:$s,
6298            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6299            Int16Regs:$r)>;
6301 def : Pat<(int_nvvm_sust_p_3d_i32_trap
6302            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6303            Int32Regs:$r),
6304           (SUST_P_3D_B32_TRAP_R Int64Regs:$s,
6305            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6306            Int32Regs:$r)>;
6308 def : Pat<(int_nvvm_sust_p_3d_v2i8_trap
6309            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6310            Int16Regs:$r, Int16Regs:$g),
6311           (SUST_P_3D_V2B8_TRAP_R Int64Regs:$s,
6312            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6313            Int16Regs:$r, Int16Regs:$g)>;
6315 def : Pat<(int_nvvm_sust_p_3d_v2i16_trap
6316            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6317            Int16Regs:$r, Int16Regs:$g),
6318           (SUST_P_3D_V2B16_TRAP_R Int64Regs:$s,
6319            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6320            Int16Regs:$r, Int16Regs:$g)>;
6322 def : Pat<(int_nvvm_sust_p_3d_v2i32_trap
6323            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6324            Int32Regs:$r, Int32Regs:$g),
6325           (SUST_P_3D_V2B32_TRAP_R Int64Regs:$s,
6326            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6327            Int32Regs:$r, Int32Regs:$g)>;
6329 def : Pat<(int_nvvm_sust_p_3d_v4i8_trap
6330            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6331            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6332           (SUST_P_3D_V4B8_TRAP_R Int64Regs:$s,
6333            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6334            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6336 def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
6337            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6338            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
6339           (SUST_P_3D_V4B16_TRAP_R Int64Regs:$s,
6340            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6341            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
6343 def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
6344            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6345            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
6346           (SUST_P_3D_V4B32_TRAP_R Int64Regs:$s,
6347            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
6348            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
6350 //-----------------------------------
6351 // Read Special Registers
6352 //-----------------------------------
// Instruction that reads a special register into a 64-bit register.
//   regname - register name without the leading '%'; the emitted asm is
//             "mov.u64 $d, %<regname>;".
//   intop   - zero-operand intrinsic that this instruction is selected for.
//   Preds   - predicates attached through Requires (none by default).
6354 class PTX_READ_SREG_R64<string regname, Intrinsic intop, list<Predicate> Preds=[]>
6355   : NVPTXInst<(outs Int64Regs:$d), (ins),
6356               !strconcat("mov.u64 \t$d, %", regname, ";"),
6357               [(set Int64Regs:$d, (intop))]>,
6358     Requires<Preds>;
// Instruction that reads a special register into a 32-bit register.
//   regname - register name without the leading '%'; the emitted asm is
//             "mov.u32 $d, %<regname>;".
//   intop   - zero-operand intrinsic that this instruction is selected for.
//   Preds   - predicates attached through Requires (none by default).
6360 class PTX_READ_SREG_R32<string regname, Intrinsic intop, list<Predicate> Preds=[]>
6361   : NVPTXInst<(outs Int32Regs:$d), (ins),
6362               !strconcat("mov.u32 \t$d, %", regname, ";"),
6363               [(set Int32Regs:$d, (intop))]>,
6364     Requires<Preds>;
// Defines four PTX_READ_SREG_R32 instructions, one per component
// (x, y, z, w) of the special register `regname`. For each suffix the
// register read is "<regname>.<suffix>" and the matched intrinsic is looked
// up by name as int_nvvm_read_ptx_sreg_<regname>_<suffix>. `Preds` is passed
// through to every generated instruction.
6366 multiclass PTX_READ_SREG_R32V4<string regname, list<Predicate> Preds=[]> {
6367    foreach suffix = ["x", "y", "z", "w"] in {
6368       defvar reg = regname # "." # suffix;
6369       defvar intr = !cast<Intrinsic>("int_nvvm_read_ptx_sreg_" # regname # "_" # suffix);
6370       def "_"#suffix :  PTX_READ_SREG_R32<reg, intr, Preds>;
6371    }
6374 // TODO Add read vector-version of special registers
// Per-component (x/y/z/w) reads of %tid, %ntid, %ctaid and %nctaid.
// No predicates are attached to these.
6376 defm INT_PTX_SREG_TID   : PTX_READ_SREG_R32V4<"tid">;
6377 defm INT_PTX_SREG_NTID  : PTX_READ_SREG_R32V4<"ntid">;
6378 defm INT_PTX_SREG_CTAID : PTX_READ_SREG_R32V4<"ctaid">;
6379 defm INT_PTX_SREG_NCTAID: PTX_READ_SREG_R32V4<"nctaid">;
// Cluster-related special registers. All of them require hasSM<90> and
// hasPTX<78>. The first four are read per component (x/y/z/w); the two
// *_ctarank registers are single 32-bit reads.
6381 defm INT_PTX_SREG_CLUSTERID :
6382        PTX_READ_SREG_R32V4<"clusterid", [hasSM<90>, hasPTX<78>]>;
6383 defm INT_PTX_SREG_NCLUSTERID :
6384        PTX_READ_SREG_R32V4<"nclusterid", [hasSM<90>, hasPTX<78>]>;
6385 defm INT_PTX_SREG_CLUSTER_CTAID :
6386        PTX_READ_SREG_R32V4<"cluster_ctaid", [hasSM<90>, hasPTX<78>]>;
6387 defm INT_PTX_SREG_CLUSTER_NCTAID:
6388        PTX_READ_SREG_R32V4<"cluster_nctaid", [hasSM<90>, hasPTX<78>]>;
6390 def  INT_PTX_SREG_CLUSTER_CTARANK :
6391        PTX_READ_SREG_R32<"cluster_ctarank",
6392                          int_nvvm_read_ptx_sreg_cluster_ctarank,
6393                          [hasSM<90>, hasPTX<78>]>;
6394 def  INT_PTX_SREG_CLUSTER_NCTARANK:
6395        PTX_READ_SREG_R32<"cluster_nctarank",
6396                          int_nvvm_read_ptx_sreg_cluster_nctarank,
6397                          [hasSM<90>, hasPTX<78>]>;
// Scalar 32-bit special-register reads (%laneid, %warpid, %nwarpid, %smid,
// %nsmid, %gridid), each matched to its int_nvvm_read_ptx_sreg_* intrinsic.
// No predicates are attached.
6400 def INT_PTX_SREG_LANEID :
6401     PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>;
6402 def INT_PTX_SREG_WARPID :
6403     PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>;
6404 def INT_PTX_SREG_NWARPID :
6405     PTX_READ_SREG_R32<"nwarpid", int_nvvm_read_ptx_sreg_nwarpid>;
6406 def INT_PTX_SREG_SMID :
6407     PTX_READ_SREG_R32<"smid", int_nvvm_read_ptx_sreg_smid>;
6408 def INT_PTX_SREG_NSMID :
6409     PTX_READ_SREG_R32<"nsmid", int_nvvm_read_ptx_sreg_nsmid>;
6410 def INT_PTX_SREG_GRIDID :
6411     PTX_READ_SREG_R32<"gridid", int_nvvm_read_ptx_sreg_gridid>;
// 32-bit reads of the %lanemask_{eq,le,lt,ge,gt} special registers, each
// matched to the int_nvvm_read_ptx_sreg_lanemask_* intrinsic of the same name.
6413 def INT_PTX_SREG_LANEMASK_EQ :
6414     PTX_READ_SREG_R32<"lanemask_eq", int_nvvm_read_ptx_sreg_lanemask_eq>;
6415 def INT_PTX_SREG_LANEMASK_LE :
6416     PTX_READ_SREG_R32<"lanemask_le", int_nvvm_read_ptx_sreg_lanemask_le>;
6417 def INT_PTX_SREG_LANEMASK_LT :
6418     PTX_READ_SREG_R32<"lanemask_lt", int_nvvm_read_ptx_sreg_lanemask_lt>;
6419 def INT_PTX_SREG_LANEMASK_GE :
6420     PTX_READ_SREG_R32<"lanemask_ge", int_nvvm_read_ptx_sreg_lanemask_ge>;
6421 def INT_PTX_SREG_LANEMASK_GT :
6422     PTX_READ_SREG_R32<"lanemask_gt", int_nvvm_read_ptx_sreg_lanemask_gt>;
// Reads of %clock (32-bit), %clock64 and %globaltimer (64-bit). These are
// the only special-register reads in this section defined with
// hasSideEffects = 1.
// NOTE(review): presumably this keeps successive reads from being merged or
// removed, since the value changes between reads -- confirm.
6424 let hasSideEffects = 1 in {
6425 def INT_PTX_SREG_CLOCK :
6426     PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
6427 def INT_PTX_SREG_CLOCK64 :
6428     PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
6429 def INT_PTX_SREG_GLOBALTIMER :
6430     PTX_READ_SREG_R64<"globaltimer", int_nvvm_read_ptx_sreg_globaltimer>;
// Select the generic i64 readcyclecounter / readsteadycounter nodes to the
// %clock64 and %globaltimer reads, respectively.
6433 def: Pat <(i64 (readcyclecounter)), (INT_PTX_SREG_CLOCK64)>;
6434 def: Pat <(i64 (readsteadycounter)), (INT_PTX_SREG_GLOBALTIMER)>;
// 32-bit reads of the %pm0..%pm3 special registers.
6436 def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
6437 def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
6438 def INT_PTX_SREG_PM2 : PTX_READ_SREG_R32<"pm2", int_nvvm_read_ptx_sreg_pm2>;
6439 def INT_PTX_SREG_PM3 : PTX_READ_SREG_R32<"pm3", int_nvvm_read_ptx_sreg_pm3>;
// Warp-size read, matched to int_nvvm_read_ptx_sreg_warpsize. The source
// operand is the bare identifier WARP_SZ rather than a '%'-prefixed register
// name, whereas the PTX_READ_SREG_* classes always emit "%<regname>".
6441 // TODO: It would be nice to use PTX_READ_SREG here, but it doesn't
6442 // handle the constant.
6443 def INT_PTX_SREG_WARPSIZE :
6444     NVPTXInst<(outs Int32Regs:$dst), (ins), "mov.u32 \t$dst, WARP_SZ;",
6445               [(set Int32Regs:$dst, (int_nvvm_read_ptx_sreg_warpsize))]>;
6447 // Helper class that represents a 'fragment' of an NVPTX *MMA instruction.
6448 // In addition to target-independent fields provided by WMMA_REGS, it adds
6449 // the fields commonly used to implement specific PTX instruction -- register
6450 // types and names, constraints, parts of assembly, etc.
//   r  - the WMMA_REGS record to extend; its geom, frag and ptx_elt_type are
//        forwarded to the WMMA_REGS base class.
//   op - name of the operation the fragment is used with. In this class it is
//        only consulted by the Predicates selection below (compared against
//        "mma" and "ldmatrix").
6451 class WMMA_REGINFO<WMMA_REGS r, string op>
6452       : WMMA_REGS<r.geom, r.frag, r.ptx_elt_type> {
6453   // NVPTX register types used to carry fragment data.
       // Every element type except f32 and f64 is carried in Int32Regs.
6454   NVPTXRegClass regclass = !cond(
6455     !eq(ptx_elt_type, "f16") : Int32Regs,
6456     !eq(ptx_elt_type, "f32") : Float32Regs,
6457     !eq(ptx_elt_type, "f64") : Float64Regs,
6458     !eq(ptx_elt_type, "bf16") : Int32Regs,
6459     !eq(ptx_elt_type, "tf32") : Int32Regs,
6460     !eq(ptx_elt_type, "s32") : Int32Regs,
6461     !eq(ptx_elt_type, "b16") : Int32Regs,
6462     !eq(ptx_elt_type, "s8") : Int32Regs,
6463     !eq(ptx_elt_type, "u8") : Int32Regs,
6464     !eq(ptx_elt_type, "s4") : Int32Regs,
6465     !eq(ptx_elt_type, "u4") : Int32Regs,
6466     !eq(ptx_elt_type, "b1") : Int32Regs);
6468   // Instruction input/output arguments for the fragment.
       // One `regclass` entry per element of `regs` (a field not defined in
       // this class; presumably inherited from WMMA_REGS).
6469   list<NVPTXRegClass> ptx_regs = !listsplat(regclass, !size(regs));
6471   // List of register names for the fragment -- ["ra0", "ra1",...]
6472   list<string> reg_names = RegSeq<!size(ptx_regs), "r"#frag>.ret;
6474   // Generates "{{$r0, $r1,.... $rN-1}}" for use in asm string construction.
6475   string regstring = "{{$" # !interleave(reg_names, ", $") # "}}";
6477   // Predicates for particular fragment variant. Technically those are
6478   // per-instruction predicates, but currently all fragments that can be used in
6479   // a given instruction are subject to the same constraints, so an instruction
6480   // can use predicates from any of its fragments. If/when this is no
6481   // longer the case, we can concat all per-fragment predicates to enforce that
6482   // all fragments of the instruction are viable.
       //
       // !cond yields the value of the first clause whose condition is true,
       // so the order of the clauses below matters. There is no catch-all
       // clause: a (geom, ptx_elt_type, op) combination that matches none of
       // them is a TableGen error.
6483   list<Predicate> Predicates = !cond(
6484     // fp16 -> fp16/fp32 @ m16n16k16
6485     !and(!eq(geom, "m16n16k16"),
6486          !or(!eq(ptx_elt_type, "f16"),
6487              !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<60>],
         // f64 @ m8n8k4
6489     !and(!eq(geom,"m8n8k4"),
6490          !eq(ptx_elt_type, "f64")) : [hasSM<80>, hasPTX<70>],
6492     // fp16 -> fp16/fp32 @ m8n32k16/m32n8k16
6493     !and(!or(!eq(geom, "m8n32k16"),
6494              !eq(geom, "m32n8k16")),
6495          !or(!eq(ptx_elt_type, "f16"),
6496              !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<61>],
6498     // u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16
6499     !and(!or(!eq(geom,"m16n16k16"),
6500              !eq(geom,"m8n32k16"),
6501              !eq(geom,"m32n8k16")),
6502          !or(!eq(ptx_elt_type, "u8"),
6503              !eq(ptx_elt_type, "s8"),
6504              !eq(ptx_elt_type, "s32"))) : [hasSM<72>, hasPTX<63>],
         // bf16 @ m16n16k16/m8n32k16/m32n8k16
6506     !and(!or(!eq(geom,"m16n16k16"),
6507              !eq(geom,"m8n32k16"),
6508              !eq(geom,"m32n8k16")),
6509          !eq(ptx_elt_type, "bf16")) : [hasSM<80>, hasPTX<70>],
         // tf32 @ m16n16k8
6511     !and(!eq(geom,"m16n16k8"),
6512          !eq(ptx_elt_type, "tf32")) : [hasSM<80>, hasPTX<70>],
         // f32 @ m16n16k8
6514     !and(!eq(geom,"m16n16k8"),
6515          !eq(ptx_elt_type, "f32")) : [hasSM<80>, hasPTX<70>],
6517     // b1 -> s32 @ m8n8k128(b1)
6518     !and(!ne(op,"mma"),
6519          !eq(geom,"m8n8k128")) : [hasSM<75>, hasPTX<63>],
6521     // u4/s4 -> s32 @ m8n8k32 (u4/s4)
6522     !and(!ne(op,"mma"),
6523          !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<63>],
         // any element type @ m16n8k8/m8n8k16
6525     !or(!eq(geom,"m16n8k8"),
6526         !eq(geom,"m8n8k16")) : [hasSM<75>, hasPTX<65>],
         // non-f64 @ m8n8k4
6528     !and(!ne(ptx_elt_type,"f64"),
6529          !eq(geom, "m8n8k4")) : [hasSM<70>, hasPTX<64>],
6531     // mma m8n8k32 requires higher PTX version
6532     !and(!eq(op,"mma"),
6533          !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<65>],
         // NOTE(review): this clause tests the same f64/m8n8k4 combination as
         // the second clause above, which comes first and yields the same
         // value, so this one appears to be unreachable.
6535     !and(!eq(ptx_elt_type,"f64"),
6536          !eq(geom, "m8n8k4")) : [hasSM<80>, hasPTX<70>],
         // op == "mma" with the remaining geometries listed here
6538     !and(!eq(op,"mma"),
6539          !or(!eq(geom, "m16n8k16"),
6540              !eq(geom, "m16n8k4"),
6541              !eq(geom, "m16n8k32"),
6542              !eq(geom, "m16n8k64"),
6543              !eq(geom, "m8n8k128"),
6544              !eq(geom, "m16n8k128"),
6545              !eq(geom, "m16n8k256"))) : [hasSM<80>, hasPTX<70>],
         // ldmatrix b16 @ m8n8
6547     !and(!eq(op,"ldmatrix"),
6548          !eq(ptx_elt_type,"b16"),
6549          !eq(geom, "m8n8")) : [hasSM<75>, hasPTX<65>]);
6551   // template DAGs for instruction inputs/output.
       // Both pair each entry of ptx_regs with the same-index name from
       // reg_names; they differ only in the dag operator (outs vs. ins).
6552   dag Outs = !dag(outs, ptx_regs, reg_names);
6553   dag Ins = !dag(ins, ptx_regs, reg_names);
6556 // Convert dag of arguments into a dag to match given intrinsic.
// The operator `ins` is replaced by `Intr`, and the memory operand classes
// are replaced by their address-matching counterparts: MEMri -> ADDRri,
// MEMri64 -> ADDRri64, imem -> ADDRvar. Operand names are left as they are.
6557 class BuildPatternI<Intrinsic Intr, dag Ins> {
6558   // Build a dag pattern that matches the intrinsic call.
6559   dag ret = !foreach(tmp, Ins,
6560                           !subst(imem, ADDRvar,
6561                           !subst(MEMri64, ADDRri64,
6562                           !subst(MEMri, ADDRri,
6563                           !subst(ins, Intr, tmp)))));
6566 // Same as above, but uses PatFrag instead of an Intrinsic.
// That is: the operator `ins` of `Ins` is replaced by the PatFrag `Intr`, and
// MEMri / MEMri64 / imem operands become ADDRri / ADDRri64 / ADDRvar.
6567 class BuildPatternPF<PatFrag Intr, dag Ins> {
6568   // Build a dag pattern that matches the intrinsic call.
6569   dag ret = !foreach(tmp, Ins,
6570                           !subst(imem, ADDRvar,
6571                           !subst(MEMri64, ADDRri64,
6572                           !subst(MEMri, ADDRri,
6573                           !subst(ins, Intr, tmp)))));
6576 // Common WMMA-related fields used for building patterns for all MMA instructions.
//   _Intr - name of the intrinsic record, resolved with !cast.
//   _Args - list of input dags that are concatenated into `Args`.
// The NVPTXInst base is given empty operand lists, a "?" asm string and no
// pattern; the subclasses in this file override the operand lists and
// AsmString with `let`.
6577 class WMMA_INSTR<string _Intr, list<dag> _Args>
6578   : NVPTXInst<(outs), (ins), "?", []> {
6579   Intrinsic Intr = !cast<Intrinsic>(_Intr);
6580   // Concatenate all arguments into a single dag.
6581   dag Args = !foldl((ins), _Args, a, b, !con(a,b));
6582   // Pre-build the pattern to match (intrinsic arg0, arg1, ...).
       // NOTE(review): Intr is already declared as Intrinsic, so the !cast
       // here looks redundant.
6583   dag IntrinsicPattern = BuildPatternI<!cast<Intrinsic>(Intr), Args>.ret;
6587 // wmma.load.[a|b|c].sync.[row|col].m16n16k16[|.global|.shared].[f16|f32]
//
// One wmma.load variant.
//   Frag       - fragment description; supplies the output registers, the
//                geometry / element-type parts of the asm string and the
//                predicates.
//   Layout     - layout string placed in the asm string.
//   Space      - address-space suffix for the asm string (".global",
//                ".shared", or "" which is treated as generic below).
//   WithStride - when set, an extra Int32Regs:$ldm operand is added and
//                printed after the address.
//   SrcOp      - operand class used for the $src address.
6590 class WMMA_LOAD<WMMA_REGINFO Frag, string Layout, string Space, bit WithStride,
6591                 DAGOperand SrcOp>
6592   : WMMA_INSTR<WMMA_NAME_LDST<"load", Frag, Layout, WithStride>.record,
6593                               [!con((ins SrcOp:$src),
6594                                     !if(WithStride, (ins Int32Regs:$ldm), (ins)))]>,
6595     Requires<Frag.Predicates> {
6596   // Load/store intrinsics are overloaded on pointer's address space.
6597   // To match the right intrinsic, we need to build AS-constrained PatFrag.
6598   // Operands is a dag equivalent in shape to Args, but using (ops node:$name, .....).
6599   dag PFOperands = !if(WithStride, (ops node:$src, node:$ldm), (ops node:$src));
6600   dag PFOperandsIntr = !if(WithStride, (Intr node:$src, node:$ldm), (Intr node:$src));
6601   // Build PatFrag that only matches particular address space.
       // Any Space other than ".shared" / ".global" uses the generic check.
6602   PatFrag IntrFrag = PatFrag<PFOperands,
6603                              PFOperandsIntr,
6604                              !cond(!eq(Space, ".shared"): AS_match.shared,
6605                                    !eq(Space, ".global"): AS_match.global,
6606                                    true: AS_match.generic)>;
6607   // Build AS-constrained pattern.
6608   let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
6610   let OutOperandList = Frag.Outs;
       // The trailing MmaCode:$ptx operand is not part of the intrinsic; it is
       // referenced as "${ptx:aligned}" in the asm string and is filled in by
       // MMA_PAT with ptx.version.
6611   let InOperandList = !con(Args, (ins MmaCode:$ptx));
6612   let AsmString = "wmma.load."
6613                   # Frag.frag
6614                   # ".sync"
6615                   # "${ptx:aligned}"
6616                   # "." # Layout
6617                   # "." # Frag.geom
6618                   # Space
6619                   # "." # Frag.ptx_elt_type # " \t"
6620                   # Frag.regstring
6621                   # ", [$src]"
6622                   # !if(WithStride, ", $ldm", "")
6623                   # ";";
6627 // wmma.store.d.sync.[row|col].m16n16k16[|.global|.shared].[f16|f32]
//
// One wmma.store.d variant.
//   Frag       - fragment description; supplies the stored registers
//                (Frag.Ins), parts of the asm string and the predicates.
//   Layout     - layout string placed in the asm string.
//   Space      - address-space suffix for the asm string (".global",
//                ".shared", or "" which is treated as generic below).
//   WithStride - when set, an extra Int32Regs:$ldm operand is added last.
//   DstOp      - operand class used for the $dst address.
// The instruction has no outputs; its inputs are $dst, the fragment
// registers, the optional $ldm, and the trailing MmaCode:$ptx operand.
6629 class WMMA_STORE_D<WMMA_REGINFO Frag, string Layout, string Space,
6630                    bit WithStride, DAGOperand DstOp>
6631   : WMMA_INSTR<WMMA_NAME_LDST<"store", Frag, Layout, WithStride>.record,
6632                [!con((ins DstOp:$dst),
6633                      Frag.Ins,
6634                      !if(WithStride, (ins Int32Regs:$ldm), (ins)))]>,
6635     Requires<Frag.Predicates> {
6637   // Load/store intrinsics are overloaded on pointer's address space.
6638   // To match the right intrinsic, we need to build AS-constrained PatFrag.
6639   // Operands is a dag equivalent in shape to Args, but using (ops node:$name, .....).
6640   dag PFOperands = !con((ops node:$dst),
6641                         !dag(ops, !listsplat(node, !size(Frag.regs)), Frag.reg_names),
6642                         !if(WithStride, (ops node:$ldm), (ops)));
6643   // Build PatFrag that only matches particular address space.
       // The fragment body is PFOperands with its `ops` operator replaced by
       // the intrinsic. Any Space other than ".shared" / ".global" uses the
       // generic check.
6644   PatFrag IntrFrag = PatFrag<PFOperands,
6645                              !foreach(tmp, PFOperands, !subst(ops, Intr, tmp)),
6646                              !cond(!eq(Space, ".shared"): AS_match.shared,
6647                                    !eq(Space, ".global"): AS_match.global,
6648                                    true: AS_match.generic)>;
6649   // Build AS-constrained pattern.
6650   let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
6652   let InOperandList  = !con(Args, (ins MmaCode:$ptx));
6653   let OutOperandList = (outs);
6654   let AsmString = "wmma.store.d.sync"
6655                   # "${ptx:aligned}"
6656                   # "." # Layout
6657                   # "." # Frag.geom
6658                   # Space
6659                   # "." # Frag.ptx_elt_type
6660                   # " \t[$dst],"
6661                   # Frag.regstring
6662                   # !if(WithStride, ", $ldm", "")
6663                   # ";";
6666 // Create all load/store variants
// Instantiates an anonymous WMMA_LOAD / WMMA_STORE_D record for every
// combination of layout, stride flag, address space and address operand
// class, for each load / store fragment that NVVM_WMMA_LDST_SUPPORTED
// accepts for that layout. All records are collected in the MMA_LDSTs list,
// which is later used to generate the selection patterns.
6667 defset list<WMMA_INSTR> MMA_LDSTs  = {
6668   foreach layout = ["row", "col"] in {
6669     foreach stride = [false, true] in {
6670       foreach space = [".global", ".shared", ""] in {
6671         foreach addr = [imem, Int32Regs, Int64Regs, MEMri, MEMri64] in {
6672           foreach frag = NVVM_MMA_OPS.all_ld_ops in
6673             if NVVM_WMMA_LDST_SUPPORTED<frag, layout>.ret then
6674               def : WMMA_LOAD<WMMA_REGINFO<frag, "load">, layout, space, stride, addr>;
6675           foreach frag = NVVM_MMA_OPS.all_st_ops in
6676             if NVVM_WMMA_LDST_SUPPORTED<frag, layout>.ret then
6677               def : WMMA_STORE_D<WMMA_REGINFO<frag, "store">, layout, space, stride, addr>;
6678         } // addr
6679       } // space
6680     } // stride
6681   } // layout
6682 } // defset
6684 // B1 instruction variants need extra constraints.
// `ret` is FragA's predicate list, with [hasSM<80>, hasPTX<71>] appended
// when b1op is ".and.popc"; for any other b1op nothing is appended.
// `Op` and `Frag` simply record the two template arguments.
6685 class MMA_OP_PREDICATES<WMMA_REGINFO FragA, string b1op> {
6686   string Op = b1op;
6687   WMMA_REGINFO Frag = FragA;
6688   list<Predicate> ret = !listconcat(
6689     FragA.Predicates,
6690     !if(!eq(b1op, ".and.popc"), [hasSM<80>,hasPTX<71>],[])
6691   );
6693 // WMMA.MMA
// One wmma.mma variant.
//   FragA/FragB/FragC - input fragments; their Ins dags form the operands.
//   FragD             - result fragment; its Outs dag forms the outputs.
//   ALayout/BLayout   - layout strings placed in the asm string.
//   Satfinite         - when non-zero, ".satfinite" is appended.
//   rnd               - rounding-mode name; emitted as ".<rnd>" unless empty.
//   b1op              - suffix emitted right after "wmma.mma"; also selects
//                       extra predicates via MMA_OP_PREDICATES.
6694 class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
6695                WMMA_REGINFO FragC, WMMA_REGINFO FragD,
6696                string ALayout, string BLayout, int Satfinite, string rnd, string b1op>
6697   : WMMA_INSTR<WMMA_NAME<ALayout, BLayout, Satfinite, rnd, b1op, FragA, FragB, FragC, FragD>.record,
6698                          [FragA.Ins, FragB.Ins, FragC.Ins]>,
6699     // Requires does not seem to have effect on Instruction w/o Patterns.
6700     // We set it here anyways and propagate to the Pat<> we construct below.
6701     Requires<MMA_OP_PREDICATES<FragA, b1op>.ret> {
6702   let OutOperandList = FragD.Outs;
6703   let InOperandList  = !con(Args, (ins MmaCode:$ptx));
       // Type suffix of the asm string: for f16 A-fragments only the D and C
       // element types are listed; otherwise all four (D, A, B, C).
6704   string TypeList = !cond(
6705     !eq(FragA.ptx_elt_type, "f16") : "." # FragD.ptx_elt_type
6706                                      # "." # FragC.ptx_elt_type,
6707     1: "." # FragD.ptx_elt_type
6708        # "." # FragA.ptx_elt_type
6709        # "." # FragB.ptx_elt_type
6710        # "." # FragC.ptx_elt_type,
6711   );
6712   let AsmString = "wmma.mma"
6713                   # b1op
6714                   # ".sync"
6715                   # "${ptx:aligned}"
6716                   # "." # ALayout
6717                   # "." # BLayout
6718                   # "." # FragA.geom
6719                   # !if(!ne(rnd, ""), !strconcat(".", rnd), "")
6720                   # TypeList
6721                   # !if(Satfinite, ".satfinite", "") # "\n\t\t"
6722                   # FragD.regstring # ",\n\t\t"
6723                   # FragA.regstring # ",\n\t\t"
6724                   # FragB.regstring # ",\n\t\t"
6725                   # FragC.regstring # ";";
// Instantiate every supported wmma.mma variant. All records are marked
// isConvergent and collected in the WMMAs list. A record is created for each
// (layout_a, layout_b, satf, rnd, op, b1op) combination accepted by
// NVVM_WMMA_SUPPORTED; `op` is indexed as [0..3] to obtain the A, B, C and D
// fragments, and the b1op values come from NVVM_MMA_B1OPS<op>.
6728 let isConvergent = true in {
6729 defset list<WMMA_INSTR> WMMAs  = {
6730   foreach layout_a = ["row", "col"] in {
6731     foreach layout_b = ["row", "col"] in {
6732       foreach satf = [0, 1] in {
6733         foreach rnd = ["", "rn", "rz", "rm", "rp"] in {
6734           foreach op = NVVM_MMA_OPS.all_wmma_ops in {
6735             foreach b1op = NVVM_MMA_B1OPS<op>.ret in {
6736               if NVVM_WMMA_SUPPORTED<op, layout_a, layout_b, satf, rnd>.ret then {
6737                 def : WMMA_MMA<WMMA_REGINFO<op[0], "wmma.mma">,
6738                               WMMA_REGINFO<op[1], "wmma.mma">,
6739                               WMMA_REGINFO<op[2], "wmma.mma">,
6740                               WMMA_REGINFO<op[3], "wmma.mma">,
6741                               layout_a, layout_b, satf, rnd, b1op>;
6742               }
6743             } // b1op
6744           } // op
6745         } // rnd
6746       } // satf
6747     } // layout_b
6748   } // layout_a
6749 } // defset
6752 // MMA
// One mma.sync.aligned variant.
//   FragA/FragB/FragC - input fragments; their Ins dags form the operands.
//   FragD             - result fragment; its Outs dag forms the outputs.
//   ALayout/BLayout   - layout strings placed in the asm string.
//   Satfinite         - when non-zero, ".satfinite" is emitted.
//   b1op              - suffix emitted after the type list; also selects
//                       extra predicates via MMA_OP_PREDICATES.
// Compared with WMMA_MMA: there is no rounding-mode parameter, the geometry
// precedes the layouts, ".satfinite" precedes the type list, b1op follows it,
// and the type list always names all four element types (D, A, B, C).
6753 class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
6754                WMMA_REGINFO FragC, WMMA_REGINFO FragD,
6755                string ALayout, string BLayout, int Satfinite, string b1op>
6756   : WMMA_INSTR<MMA_NAME<ALayout, BLayout, Satfinite, b1op, FragA, FragB, FragC, FragD>.record,
6757                         [FragA.Ins, FragB.Ins, FragC.Ins]>,
6758     // Requires does not seem to have effect on Instruction w/o Patterns.
6759     // We set it here anyways and propagate to the Pat<> we construct below.
6760   Requires<MMA_OP_PREDICATES<FragA, b1op>.ret> {
6761   let OutOperandList = FragD.Outs;
6762   let InOperandList  = !con(Args, (ins MmaCode:$ptx));
6763   string TypeList = "." # FragD.ptx_elt_type
6764                     # "." # FragA.ptx_elt_type
6765                     # "." # FragB.ptx_elt_type
6766                     # "." # FragC.ptx_elt_type;
6767   let AsmString = "mma.sync.aligned."
6768                   # FragA.geom
6769                   # "." # ALayout
6770                   # "." # BLayout
6771                   # !if(Satfinite, ".satfinite", "")
6772                   # TypeList
6773                   # b1op # "\n\t\t"
6774                   # FragD.regstring # ",\n\t\t"
6775                   # FragA.regstring # ",\n\t\t"
6776                   # FragB.regstring # ",\n\t\t"
6777                   # FragC.regstring # ";";
// Instantiate every supported mma variant. All records are marked
// isConvergent and collected in the MMAs list. A record is created for each
// (layout_a, layout_b, satf, op, b1op) combination accepted by
// NVVM_MMA_SUPPORTED; `op` is indexed as [0..3] to obtain the A, B, C and D
// fragments, and the b1op values come from NVVM_MMA_B1OPS<op>.
6780 let isConvergent = true in {
6781 defset list<WMMA_INSTR> MMAs  = {
6782   foreach layout_a = ["row", "col"] in {
6783     foreach layout_b = ["row", "col"] in {
6784       foreach satf = [0, 1] in {
6785         foreach op = NVVM_MMA_OPS.all_mma_ops in {
6786           foreach b1op = NVVM_MMA_B1OPS<op>.ret in {
6787             if NVVM_MMA_SUPPORTED<op, layout_a, layout_b, satf>.ret then {
6788               def : MMA<WMMA_REGINFO<op[0], "mma">,
6789                         WMMA_REGINFO<op[1], "mma">,
6790                         WMMA_REGINFO<op[2], "mma">,
6791                         WMMA_REGINFO<op[3], "mma">,
6792                         layout_a, layout_b, satf, b1op>;
6793             }
6794           } // b1op
6795         } // op
6796       } // satf
6797     } // layout_b
6798   } // layout_a
6799 } // defset
6803 // ldmatrix.sync.aligned.m8n8[|.trans][|.shared].b16
//
// One ldmatrix variant.
//   Frag       - fragment description; supplies the output registers, parts
//                of the asm string and the predicates.
//   Transposed - when set, ".trans" is emitted.
//   Space      - address-space suffix for the asm string; ".shared" selects
//                the shared-space check, anything else the generic one.
//   SrcOp      - operand class used for the $src address.
6805 class LDMATRIX<WMMA_REGINFO Frag, bit Transposed, string Space,
6806                DAGOperand SrcOp>
6807   : WMMA_INSTR<LDMATRIX_NAME<Frag, Transposed>.record, [(ins SrcOp:$src)]>,
6808     Requires<Frag.Predicates> {
6809   // Build PatFrag that only matches particular address space.
6810   PatFrag IntrFrag = PatFrag<(ops node:$src), (Intr node:$src),
6811                              !cond(!eq(Space, ".shared"): AS_match.shared,
6812                                    true: AS_match.generic)>;
6813   // Build AS-constrained pattern.
6814   let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
6816   let OutOperandList = Frag.Outs;
       // The MmaCode:$ptx operand is appended like in the other MMA classes,
       // although this asm string does not reference $ptx.
6817   let InOperandList = !con(Args, (ins MmaCode:$ptx));
6818   let AsmString = "ldmatrix.sync.aligned."
6819                   # Frag.geom
6820                   # "." # Frag.frag
6821                   # !if(Transposed, ".trans", "")
6822                   # Space
6823                   # "." # Frag.ptx_elt_type
6824                   # " " # Frag.regstring # ", [$src];";
6827 // Create all ldmatrix variants
// One anonymous LDMATRIX record per (transposed, space, address operand
// class, fragment) combination for which NVVM_LDMATRIX_SUPPORTED<frag> holds.
// The records are collected in the LDMATRIXs list.
6828 defset list<WMMA_INSTR> LDMATRIXs  = {
6829   foreach transposed = [false, true] in {
6830     foreach space = [".shared", ""] in {
6831       foreach addr = [imem, Int32Regs, Int64Regs, MEMri, MEMri64] in {
6832         foreach frag = NVVM_MMA_OPS.all_ldmatrix_ops in
6833           if NVVM_LDMATRIX_SUPPORTED<frag>.ret then
6834             def : LDMATRIX<WMMA_REGINFO<frag, "ldmatrix">, transposed, space,
6835                             addr>;
6836       } // addr
6837     } // space
6838   } // transposed
6839 } // defset
// Selection pattern for one WMMA_INSTR record `wi`: matches
// wi.IntrinsicPattern and produces `wi` applied to wi.Args followed by one
// extra operand, ptx.version (which corresponds to the MmaCode:$ptx operand
// the instruction classes append to their InOperandList). The pattern carries
// the same predicates as the instruction.
6841 // Constructing non-flat DAGs is still a pain. I can't !subst a dag node with a
6842 // dag, so the ptx.version must be appended *after* foreach replaces 'ins' with
6843 // the instruction record.
6844 class MMA_PAT<WMMA_INSTR wi>
6845       : Pat<wi.IntrinsicPattern,
6846             !con(!foreach(tmp, wi.Args, !subst(ins, wi, tmp)),
6847                  (wi ptx.version))>,
6848         Requires<wi.Predicates>;
6850 // Build intrinsic->instruction patterns for all MMA instructions.
// Covers the records collected by the four defsets: MMAs, WMMAs, MMA_LDSTs
// and LDMATRIXs.
6851 foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs) in
6852   def : MMA_PAT<mma>;
// Defines four "mapa<suffix>" instructions matched to intrinsic `Intr`:
//   _32  - 32-bit $a and result, $b in a 32-bit register
//   _32i - 32-bit $a and result, $b as a 32-bit immediate
//   _64  - 64-bit $a and result, $b in a 32-bit register
//   _64i - 64-bit $a and result, $b as a 32-bit immediate
// `suffix` is inserted directly after "mapa" in the asm string. All variants
// require hasSM<90> and hasPTX<78>.
6854 multiclass MAPA<string suffix, Intrinsic Intr> {
6855   def _32: NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a, Int32Regs:$b),
6856               "mapa" # suffix # ".u32\t$d, $a, $b;",
6857               [(set Int32Regs:$d, (Intr Int32Regs:$a, Int32Regs:$b))]>,
6858     Requires<[hasSM<90>, hasPTX<78>]>;
6859   def _32i: NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a, i32imm:$b),
6860               "mapa" # suffix # ".u32\t$d, $a, $b;",
6861               [(set Int32Regs:$d, (Intr Int32Regs:$a, imm:$b))]>,
6862     Requires<[hasSM<90>, hasPTX<78>]>;
6863   def _64: NVPTXInst<(outs Int64Regs:$d), (ins Int64Regs:$a, Int32Regs:$b),
6864               "mapa" # suffix # ".u64\t$d, $a, $b;",
6865               [(set Int64Regs:$d, (Intr Int64Regs:$a, Int32Regs:$b))]>,
6866     Requires<[hasSM<90>, hasPTX<78>]>;
6867   def _64i: NVPTXInst<(outs Int64Regs:$d), (ins Int64Regs:$a, i32imm:$b),
6868               "mapa" # suffix # ".u64\t$d, $a, $b;",
6869               [(set Int64Regs:$d, (Intr Int64Regs:$a, imm:$b))]>,
6870     Requires<[hasSM<90>, hasPTX<78>]>;
// mapa without a state-space suffix and its ".shared::cluster" form, each
// bound to its own intrinsic.
6873 defm mapa  : MAPA<"", int_nvvm_mapa>;
6874 defm mapa_shared_cluster  : MAPA<".shared::cluster", int_nvvm_mapa_shared_cluster>;
// Defines two "getctarank<suffix>" instructions matched to intrinsic `Intr`:
// _32 takes a 32-bit $a and _64 a 64-bit $a. In both cases the result $d is
// a 32-bit register; the .u32/.u64 in the asm string follows the width of
// the input. `suffix` is inserted directly after "getctarank". Both variants
// require hasSM<90> and hasPTX<78>.
6877 multiclass GETCTARANK<string suffix, Intrinsic Intr> {
6878   def _32: NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
6879               "getctarank" # suffix # ".u32\t$d, $a;",
6880               [(set Int32Regs:$d, (Intr Int32Regs:$a))]>,
6881     Requires<[hasSM<90>, hasPTX<78>]>;
6882   def _64: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
6883               "getctarank" # suffix # ".u64\t$d, $a;",
6884               [(set Int32Regs:$d, (Intr Int64Regs:$a))]>,
6885     Requires<[hasSM<90>, hasPTX<78>]>;
// getctarank without a state-space suffix and its ".shared::cluster" form,
// each bound to its own intrinsic.
6888 defm getctarank  : GETCTARANK<"", int_nvvm_getctarank>;
6889 defm getctarank_shared_cluster  : GETCTARANK<".shared::cluster", int_nvvm_getctarank_shared_cluster>;
// Reads %is_explicit_cluster into a predicate (Int1Regs) register with
// mov.pred; matched to int_nvvm_is_explicit_cluster. Requires hasSM<90> and
// hasPTX<78>.
6891 def is_explicit_cluster: NVPTXInst<(outs Int1Regs:$d), (ins),
6892               "mov.pred\t$d, %is_explicit_cluster;",
6893               [(set Int1Regs:$d, (int_nvvm_is_explicit_cluster))]>,
6894     Requires<[hasSM<90>, hasPTX<78>]>;
6896 // setmaxnreg inc/dec intrinsics
// SET_MAXNREG defines one instruction with no outputs and a single immediate
// operand $reg_count (matched as timm), printed as
// "setmaxnreg.<Action>.sync.aligned.u32 $reg_count;". It requires hasSM90a
// and hasPTX<80>. Both instantiations ("inc" and "dec") are marked
// isConvergent.
6897 let isConvergent = true in {
6898 multiclass SET_MAXNREG<string Action, Intrinsic Intr> {
6899   def : NVPTXInst<(outs), (ins i32imm:$reg_count),
6900           "setmaxnreg." # Action # ".sync.aligned.u32 $reg_count;",
6901           [(Intr timm:$reg_count)]>,
6902     Requires<[hasSM90a, hasPTX<80>]>;
6905 defm INT_SET_MAXNREG_INC : SET_MAXNREG<"inc", int_nvvm_setmaxnreg_inc_sync_aligned_u32>;
6906 defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_aligned_u32>;
6908 } // isConvergent
// Operand-less instruction printed as "exit;", matched to int_nvvm_exit.
6910 def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>;