1 // RUN: mlir-translate -mlir-to-llvmir %s -split-input-file --verify-diagnostics | FileCheck %s
// Checks that each NVVM special-register read op lowers to the matching
// llvm.nvvm.read.ptx.sreg.* intrinsic call.
// CHECK-LABEL: @nvvm_special_regs
llvm.func @nvvm_special_regs() -> i32 {
  // CHECK: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %1 = nvvm.read.ptx.sreg.tid.x : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  %2 = nvvm.read.ptx.sreg.tid.y : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
  %3 = nvvm.read.ptx.sreg.tid.z : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  %4 = nvvm.read.ptx.sreg.ntid.x : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
  %5 = nvvm.read.ptx.sreg.ntid.y : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
  %6 = nvvm.read.ptx.sreg.ntid.z : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %7 = nvvm.read.ptx.sreg.ctaid.x : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  %8 = nvvm.read.ptx.sreg.ctaid.y : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
  %9 = nvvm.read.ptx.sreg.ctaid.z : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  %10 = nvvm.read.ptx.sreg.nctaid.x : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
  %11 = nvvm.read.ptx.sreg.nctaid.y : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
  %12 = nvvm.read.ptx.sreg.nctaid.z : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
  %13 = nvvm.read.ptx.sreg.warpsize : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
  %14 = nvvm.read.ptx.sreg.laneid : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clusterid.x
  %15 = nvvm.read.ptx.sreg.clusterid.x : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clusterid.y
  %16 = nvvm.read.ptx.sreg.clusterid.y : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clusterid.z
  %17 = nvvm.read.ptx.sreg.clusterid.z : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x
  %18 = nvvm.read.ptx.sreg.nclusterid.x : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y
  %19 = nvvm.read.ptx.sreg.nclusterid.y : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z
  %20 = nvvm.read.ptx.sreg.nclusterid.z : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid
  %21 = nvvm.read.ptx.sreg.cluster.ctaid.x : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid
  %22 = nvvm.read.ptx.sreg.cluster.ctaid.y : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid
  %23 = nvvm.read.ptx.sreg.cluster.ctaid.z : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid
  %24 = nvvm.read.ptx.sreg.cluster.nctaid.x : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid
  %25 = nvvm.read.ptx.sreg.cluster.nctaid.y : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid
  %26 = nvvm.read.ptx.sreg.cluster.nctaid.z : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank
  %27 = nvvm.read.ptx.sreg.cluster.ctarank : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank
  %28 = nvvm.read.ptx.sreg.cluster.nctarank : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock
  %29 = nvvm.read.ptx.sreg.clock : i32
  // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64
  %30 = nvvm.read.ptx.sreg.clock64 : i64
  // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.globaltimer
  %31 = nvvm.read.ptx.sreg.globaltimer : i64
  // A range attribute on the op becomes a range() result attribute on the call.
  // CHECK: %32 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %32 = nvvm.read.ptx.sreg.tid.x range <i32, 0, 64> : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid
  %33 = nvvm.read.ptx.sreg.warpid : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid
  %34 = nvvm.read.ptx.sreg.nwarpid : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid
  %35 = nvvm.read.ptx.sreg.smid : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid
  %36 = nvvm.read.ptx.sreg.nsmid : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid
  %37 = nvvm.read.ptx.sreg.gridid : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg0
  %38 = nvvm.read.ptx.sreg.envreg0 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg1
  %39 = nvvm.read.ptx.sreg.envreg1 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg2
  %40 = nvvm.read.ptx.sreg.envreg2 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg3
  %41 = nvvm.read.ptx.sreg.envreg3 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg4
  %42 = nvvm.read.ptx.sreg.envreg4 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg5
  %43 = nvvm.read.ptx.sreg.envreg5 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg6
  %44 = nvvm.read.ptx.sreg.envreg6 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg7
  %45 = nvvm.read.ptx.sreg.envreg7 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg8
  %46 = nvvm.read.ptx.sreg.envreg8 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg9
  %47 = nvvm.read.ptx.sreg.envreg9 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg10
  %48 = nvvm.read.ptx.sreg.envreg10 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg11
  %49 = nvvm.read.ptx.sreg.envreg11 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg12
  %50 = nvvm.read.ptx.sreg.envreg12 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg13
  %51 = nvvm.read.ptx.sreg.envreg13 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg14
  %52 = nvvm.read.ptx.sreg.envreg14 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg15
  %53 = nvvm.read.ptx.sreg.envreg15 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg16
  %54 = nvvm.read.ptx.sreg.envreg16 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg17
  %55 = nvvm.read.ptx.sreg.envreg17 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg18
  %56 = nvvm.read.ptx.sreg.envreg18 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg19
  %57 = nvvm.read.ptx.sreg.envreg19 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg20
  %58 = nvvm.read.ptx.sreg.envreg20 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg21
  %59 = nvvm.read.ptx.sreg.envreg21 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg22
  %60 = nvvm.read.ptx.sreg.envreg22 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg23
  %61 = nvvm.read.ptx.sreg.envreg23 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg24
  %62 = nvvm.read.ptx.sreg.envreg24 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg25
  %63 = nvvm.read.ptx.sreg.envreg25 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg26
  %64 = nvvm.read.ptx.sreg.envreg26 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg27
  %65 = nvvm.read.ptx.sreg.envreg27 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg28
  %66 = nvvm.read.ptx.sreg.envreg28 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg29
  %67 = nvvm.read.ptx.sreg.envreg29 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg30
  %68 = nvvm.read.ptx.sreg.envreg30 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg31
  %69 = nvvm.read.ptx.sreg.envreg31 : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq
  %70 = nvvm.read.ptx.sreg.lanemask.eq : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le
  %71 = nvvm.read.ptx.sreg.lanemask.le : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt
  %72 = nvvm.read.ptx.sreg.lanemask.lt : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge
  %73 = nvvm.read.ptx.sreg.lanemask.ge : i32
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt
  %74 = nvvm.read.ptx.sreg.lanemask.gt : i32
  llvm.return %74 : i32
}

// -----
// CHECK-LABEL: @nvvm_rcp
llvm.func @nvvm_rcp(%0: f32) -> f32 {
  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
  %1 = nvvm.rcp.approx.ftz.f %0 : f32
  llvm.return %1 : f32
}

// -----
// CHECK-LABEL: @llvm_nvvm_barrier0
llvm.func @llvm_nvvm_barrier0() {
  // CHECK: call void @llvm.nvvm.barrier0()
  nvvm.barrier0
  llvm.return
}

// -----
// The bare form lowers to barrier0; with an id, to barrier.n; with id and
// thread count, to the two-operand barrier intrinsic.
// CHECK-LABEL: @llvm_nvvm_barrier(
// CHECK-SAME: i32 %[[barId:.*]], i32 %[[numThreads:.*]])
llvm.func @llvm_nvvm_barrier(%barID : i32, %numberOfThreads : i32) {
  // CHECK: call void @llvm.nvvm.barrier0()
  nvvm.barrier
  // CHECK: call void @llvm.nvvm.barrier.n(i32 %[[barId]])
  nvvm.barrier id = %barID
  // CHECK: call void @llvm.nvvm.barrier(i32 %[[barId]], i32 %[[numThreads]])
  nvvm.barrier id = %barID number_of_threads = %numberOfThreads
  llvm.return
}

// -----
// CHECK-LABEL: @llvm_nvvm_cluster_arrive
llvm.func @llvm_nvvm_cluster_arrive() {
  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
  nvvm.cluster.arrive
  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.aligned()
  nvvm.cluster.arrive {aligned}
  llvm.return
}

// -----
// CHECK-LABEL: @llvm_nvvm_cluster_arrive_relaxed
llvm.func @llvm_nvvm_cluster_arrive_relaxed() {
  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
  nvvm.cluster.arrive.relaxed
  // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed.aligned()
  nvvm.cluster.arrive.relaxed {aligned}
  llvm.return
}

// -----
// CHECK-LABEL: @llvm_nvvm_cluster_wait
llvm.func @llvm_nvvm_cluster_wait() {
  // CHECK: call void @llvm.nvvm.barrier.cluster.wait()
  nvvm.cluster.wait
  // CHECK: call void @llvm.nvvm.barrier.cluster.wait.aligned()
  nvvm.cluster.wait {aligned}
  llvm.return
}

// -----
// CHECK-LABEL: @nvvm_shfl
llvm.func @nvvm_shfl(
    %0 : i32, %1 : i32, %2 : i32,
    %3 : i32, %4 : f32) -> i32 {
  // CHECK: call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %6 = nvvm.shfl.sync bfly %0, %3, %1, %2 : i32 -> i32
  // CHECK: call float @llvm.nvvm.shfl.sync.bfly.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %7 = nvvm.shfl.sync bfly %0, %4, %1, %2 : f32 -> f32
  // CHECK: call i32 @llvm.nvvm.shfl.sync.up.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %8 = nvvm.shfl.sync up %0, %3, %1, %2 : i32 -> i32
  // CHECK: call float @llvm.nvvm.shfl.sync.up.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %9 = nvvm.shfl.sync up %0, %4, %1, %2 : f32 -> f32
  // CHECK: call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %10 = nvvm.shfl.sync down %0, %3, %1, %2 : i32 -> i32
  // CHECK: call float @llvm.nvvm.shfl.sync.down.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %11 = nvvm.shfl.sync down %0, %4, %1, %2 : f32 -> f32
  // CHECK: call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %12 = nvvm.shfl.sync idx %0, %3, %1, %2 : i32 -> i32
  // CHECK: call float @llvm.nvvm.shfl.sync.idx.f32(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %13 = nvvm.shfl.sync idx %0, %4, %1, %2 : f32 -> f32
  llvm.return %6 : i32
}

// -----
// Same shuffles with {return_value_and_is_valid}: the predicate variants
// return a {value, i1} pair.
// CHECK-LABEL: @nvvm_shfl_pred
llvm.func @nvvm_shfl_pred(
    %0 : i32, %1 : i32, %2 : i32,
    %3 : i32, %4 : f32) -> !llvm.struct<(i32, i1)> {
  // CHECK: call { i32, i1 } @llvm.nvvm.shfl.sync.bfly.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %6 = nvvm.shfl.sync bfly %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)>
  // CHECK: call { float, i1 } @llvm.nvvm.shfl.sync.bfly.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %7 = nvvm.shfl.sync bfly %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
  // CHECK: call { i32, i1 } @llvm.nvvm.shfl.sync.up.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %8 = nvvm.shfl.sync up %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)>
  // CHECK: call { float, i1 } @llvm.nvvm.shfl.sync.up.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %9 = nvvm.shfl.sync up %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
  // CHECK: call { i32, i1 } @llvm.nvvm.shfl.sync.down.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %10 = nvvm.shfl.sync down %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)>
  // CHECK: call { float, i1 } @llvm.nvvm.shfl.sync.down.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %11 = nvvm.shfl.sync down %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
  // CHECK: call { i32, i1 } @llvm.nvvm.shfl.sync.idx.i32p(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %12 = nvvm.shfl.sync idx %0, %3, %1, %2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i1)>
  // CHECK: call { float, i1 } @llvm.nvvm.shfl.sync.idx.f32p(i32 %{{.*}}, float %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
  %13 = nvvm.shfl.sync idx %0, %4, %1, %2 {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
  llvm.return %6 : !llvm.struct<(i32, i1)>
}

// -----
// CHECK-LABEL: @nvvm_vote
llvm.func @nvvm_vote(%0 : i32, %1 : i1) -> i32 {
  // CHECK: call i32 @llvm.nvvm.vote.ballot.sync(i32 %{{.*}}, i1 %{{.*}})
  %3 = nvvm.vote.ballot.sync %0, %1 : i32
  llvm.return %3 : i32
}

// -----
// CHECK-LABEL: @nvvm_elect_sync
llvm.func @nvvm_elect_sync() -> i1 {
  // CHECK: %[[RES:.*]] = call { i32, i1 } @llvm.nvvm.elect.sync(i32 -1)
  // CHECK-NEXT: %[[PRED:.*]] = extractvalue { i32, i1 } %[[RES]], 1
  // CHECK-NEXT: ret i1 %[[PRED]]
  %0 = nvvm.elect.sync -> i1
  llvm.return %0 : i1
}

// -----
// CHECK-LABEL: @nvvm_mma_mn8n8k4_row_col_f32_f32
llvm.func @nvvm_mma_mn8n8k4_row_col_f32_f32(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
                                            %b0 : vector<2xf16>, %b1 : vector<2xf16>,
                                            %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32,
                                            %c4 : f32, %c5 : f32, %c6 : f32, %c7 : f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> {
  // CHECK: call { float, float, float, float, float, float, float, float } @llvm.nvvm.mma.m8n8k4.row.col.f32.f32
  %0 = nvvm.mma.sync A[%a0, %a1] B[%b0, %b1] C[%c0, %c1, %c2, %c3, %c4, %c5, %c6, %c7]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, shape = #nvvm.shape<m = 8, n = 8, k = 4>} : (vector<2xf16>, vector<2xf16>, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
}

// -----
// CHECK-LABEL: @nvvm_mma_m16n8k16_f16_f16
llvm.func @nvvm_mma_m16n8k16_f16_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
                                     %a2 : vector<2xf16>, %a3 : vector<2xf16>,
                                     %b0 : vector<2xf16>, %b1 : vector<2xf16>,
                                     %c0 : vector<2xf16>, %c1 : vector<2xf16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)> {
  // CHECK: call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f16
  %0 = nvvm.mma.sync A[ %a0, %a1, %a2, %a3 ] B[ %b0, %b1 ] C[ %c0, %c1 ]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, shape = #nvvm.shape<m = 16, n = 8, k = 16>}
    : (vector<2xf16>, vector<2xf16>, vector<2xf16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)>
  llvm.return %0 : !llvm.struct<(vector<2xf16>, vector<2xf16>)>
}

// -----
// f32 return type, f16 accumulate type
// CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f16
llvm.func @nvvm_mma_m16n8k16_f32_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
                                     %a2 : vector<2xf16>, %a3 : vector<2xf16>,
                                     %b0 : vector<2xf16>, %b1 : vector<2xf16>,
                                     %c0 : vector<2xf16>, %c1 : vector<2xf16>) -> !llvm.struct<(f32, f32, f32, f32)> {
  // CHECK: call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f16
  %0 = nvvm.mma.sync A[%a0, %a1, %a2, %a3] B[%b0, %b1] C[%c0, %c1]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
     shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (vector<2xf16>, vector<2xf16>, vector<2xf16>) -> !llvm.struct<(f32, f32, f32, f32)>
  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
}

// -----
// f16 return type, f32 accumulate type
// CHECK-LABEL: @nvvm_mma_m16n8k16_f16_f32
llvm.func @nvvm_mma_m16n8k16_f16_f32(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
                                     %a2 : vector<2xf16>, %a3 : vector<2xf16>,
                                     %b0 : vector<2xf16>, %b1 : vector<2xf16>,
                                     %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)> {
  // CHECK: call { <2 x half>, <2 x half> } @llvm.nvvm.mma.m16n8k16.row.col.f16.f32
  %0 = nvvm.mma.sync A[%a0, %a1, %a2, %a3] B[%b0, %b1] C[%c0, %c1, %c2, %c3]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
     shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (vector<2xf16>, vector<2xf16>, f32) -> !llvm.struct<(vector<2xf16>, vector<2xf16>)>
  llvm.return %0 : !llvm.struct<(vector<2xf16>, vector<2xf16>)>
}

// -----
// f32 return type, f32 accumulate type
// CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f32
llvm.func @nvvm_mma_m16n8k16_f32_f32(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
                                     %a2 : vector<2xf16>, %a3 : vector<2xf16>,
                                     %b0 : vector<2xf16>, %b1 : vector<2xf16>,
                                     %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) -> !llvm.struct<(f32, f32, f32, f32)> {
  // CHECK: call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.f32.f32
  %0 = nvvm.mma.sync A[%a0, %a1, %a2, %a3] B[%b0, %b1] C[%c0, %c1, %c2, %c3]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
     shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (vector<2xf16>, vector<2xf16>, f32) -> !llvm.struct<(f32, f32, f32, f32)>
  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
}

// -----
// CHECK-LABEL: @nvvm_mma_m16n8k16_s8_s8
llvm.func @nvvm_mma_m16n8k16_s8_s8(%a0 : i32, %a1 : i32,
                                   %b0 : i32,
                                   %c0 : i32, %c1 : i32, %c2 : i32, %c3 : i32) -> !llvm.struct<(i32, i32, i32, i32)> {
  // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.s8
  %0 = nvvm.mma.sync A[%a0, %a1] B[%b0] C[%c0, %c1, %c2, %c3]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
     multiplicandAPtxType = #nvvm.mma_type<s8>, multiplicandBPtxType = #nvvm.mma_type<s8>,
     intOverflowBehavior=#nvvm.mma_int_overflow<wrapped>,
     shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (i32, i32, i32) -> !llvm.struct<(i32,i32,i32,i32)>
  llvm.return %0 : !llvm.struct<(i32,i32,i32,i32)>
}

// -----
// CHECK-LABEL: @nvvm_mma_m16n8k16_s8_u8
llvm.func @nvvm_mma_m16n8k16_s8_u8(%a0 : i32, %a1 : i32,
                                   %b0 : i32,
                                   %c0 : i32, %c1 : i32, %c2 : i32, %c3 : i32) -> !llvm.struct<(i32, i32, i32, i32)> {
  // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.satfinite.s8.u8
  %0 = nvvm.mma.sync A[%a0, %a1] B[%b0] C[%c0, %c1, %c2, %c3]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
     multiplicandAPtxType = #nvvm.mma_type<s8>, multiplicandBPtxType = #nvvm.mma_type<u8>,
     intOverflowBehavior=#nvvm.mma_int_overflow<satfinite>,
     shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (i32, i32, i32) -> !llvm.struct<(i32,i32,i32,i32)>
  llvm.return %0 : !llvm.struct<(i32,i32,i32,i32)>
}

// -----
// CHECK-LABEL: @nvvm_mma_m16n8k128_b1_b1
llvm.func @nvvm_mma_m16n8k128_b1_b1(%a0 : i32, %a1 : i32,
                                    %b0 : i32,
                                    %c0 : i32, %c1 : i32, %c2 : i32, %c3 : i32) -> !llvm.struct<(i32,i32,i32,i32)> {
  // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.mma.xor.popc.m16n8k128.row.col.b1
  %0 = nvvm.mma.sync A[%a0, %a1] B[%b0] C[%c0, %c1, %c2, %c3]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
     multiplicandAPtxType = #nvvm.mma_type<b1>, multiplicandBPtxType = #nvvm.mma_type<b1>,
     b1Op = #nvvm.mma_b1op<xor_popc>, shape = #nvvm.shape<m = 16, n = 8, k = 128>} : (i32, i32, i32) -> !llvm.struct<(i32,i32,i32,i32)>
  llvm.return %0 : !llvm.struct<(i32,i32,i32,i32)>
}

// -----
// CHECK-LABEL: @nvvm_mma_m16n8k32_s4_s4
llvm.func @nvvm_mma_m16n8k32_s4_s4(%a0 : i32, %a1 : i32,
                                   %b0 : i32,
                                   %c0 : i32, %c1 : i32, %c2 : i32, %c3 : i32) -> !llvm.struct<(i32,i32,i32,i32)> {
  // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k32.row.col.satfinite.s4
  %0 = nvvm.mma.sync A[%a0, %a1] B[%b0] C[%c0, %c1, %c2, %c3]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
     multiplicandAPtxType = #nvvm.mma_type<s4>, multiplicandBPtxType = #nvvm.mma_type<s4>,
     intOverflowBehavior=#nvvm.mma_int_overflow<satfinite>,
     shape = #nvvm.shape<m = 16, n = 8, k = 32>} : (i32, i32, i32) -> !llvm.struct<(i32,i32,i32,i32)>
  llvm.return %0 : !llvm.struct<(i32,i32,i32,i32)>
}

// -----
// CHECK-LABEL: @nvvm_mma_m8n8k4_f64_f64
llvm.func @nvvm_mma_m8n8k4_f64_f64(%a0 : f64,
                                   %b0 : f64,
                                   %c0 : f64, %c1 : f64) -> !llvm.struct<(f64, f64)> {
  // CHECK: call { double, double } @llvm.nvvm.mma.m8n8k4.row.col.f64
  %0 = nvvm.mma.sync A[%a0] B[%b0] C[%c0, %c1]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
     shape = #nvvm.shape<m = 8, n = 8, k = 4>} : (f64, f64, f64) -> !llvm.struct<(f64, f64)>
  llvm.return %0 : !llvm.struct<(f64, f64)>
}

// -----
// CHECK-LABEL: @nvvm_mma_m16n8k4_tf32_f32
llvm.func @nvvm_mma_m16n8k4_tf32_f32(%a0 : i32, %a1 : i32,
                                     %b0 : i32,
                                     %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) -> !llvm.struct<(f32, f32, f32, f32)> {
  // CHECK: call { float, float, float, float } @llvm.nvvm.mma.m16n8k4.row.col.tf32
  %0 = nvvm.mma.sync A[%a0, %a1] B[%b0] C[%c0, %c1, %c2, %c3]
    {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
     multiplicandAPtxType = #nvvm.mma_type<tf32>, multiplicandBPtxType = #nvvm.mma_type<tf32>,
     shape = #nvvm.shape<m = 16, n = 8, k = 4>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
}

// -----
// The test below checks the correct mapping of the nvvm.wmma.*.load.* op to the correct intrinsic
// in the LLVM NVPTX backend.
// CHECK-LABEL: @gpu_wmma_load_op
llvm.func @gpu_wmma_load_op(%arg0: !llvm.ptr<3>, %arg1: i32) {
  // CHECK: call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
  %0 = nvvm.wmma.load %arg0, %arg1
    {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
    : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
  llvm.return
}

// -----
// The test below checks the correct mapping of the nvvm.wmma.*.store.* op to the correct intrinsic
// in the LLVM NVPTX backend.
// CHECK-LABEL: @gpu_wmma_store_op
llvm.func @gpu_wmma_store_op(%arg0: !llvm.ptr<3>, %arg1: i32,
                             %arg2: vector<2 x f16>, %arg3: vector<2 x f16>,
                             %arg4: vector<2 xf16>, %arg5: vector<2 x f16>) {
  // CHECK: call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3(ptr addrspace(3) %{{.*}}, <2 x half> {{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, i32 %{{.*}})
  nvvm.wmma.store %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
    {eltype = #nvvm.mma_type<f16>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
    : !llvm.ptr<3>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>
  llvm.return
}

// -----
// The test below checks the correct mapping of the nvvm.wmma.*.mma.* op to the correct intrinsic
// in the LLVM NVPTX backend.
// CHECK-LABEL: @gpu_wmma_mma_op
llvm.func @gpu_wmma_mma_op(%arg0: vector<2 x f16>, %arg1: vector<2 x f16>,
                           %arg2: vector<2 x f16>, %arg3: vector<2 x f16>,
                           %arg4: vector<2 x f16>, %arg5: vector<2 x f16>,
                           %arg6: vector<2 x f16>, %arg7: vector<2 x f16>,
                           %arg8: vector<2 x f16>, %arg9: vector<2 x f16>,
                           %arg10: vector<2 x f16>, %arg11: vector<2 x f16>,
                           %arg12: vector<2 x f16>, %arg13: vector<2 x f16>,
                           %arg14: vector<2 x f16>, %arg15: vector<2 x f16>,
                           %arg16: vector<2 x f16>, %arg17: vector<2 x f16>,
                           %arg18: vector<2 x f16>, %arg19: vector<2 x f16>) {
  // CHECK: call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
  %0 = nvvm.wmma.mma %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19
    {eltypeA = #nvvm.mma_type<f16>, eltypeB = #nvvm.mma_type<f16>, k = 16 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
    : (vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>,
       vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>,
       vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>,
       vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)
    -> !llvm.struct<(vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>)>
  llvm.return
}

// -----
// CHECK-LABEL: @nvvm_wmma_load_tf32
llvm.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr, %arg1 : i32) {
  // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %{{.*}}, i32 %{{.*}})
  %0 = nvvm.wmma.load %arg0, %arg1
    {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
    : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)>
  llvm.return
}

// -----
// CHECK-LABEL: @nvvm_wmma_mma
llvm.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 : i32,
                         %6 : i32, %7 : i32, %8 : f32, %9 : f32, %10 : f32,
                         %11 : f32, %12 : f32, %13 : f32, %14 : f32, %15 : f32) {
  // CHECK: { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}})
  %r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15
    {eltypeA = #nvvm.mma_type<tf32>, eltypeB = #nvvm.mma_type<f32>, k = 8 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
    : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)
    -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
  llvm.return
}

// -----
// CHECK-LABEL: @cp_async
llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {
  // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
  nvvm.cp.async.shared.global %arg0, %arg1, 4, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
  // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
  nvvm.cp.async.shared.global %arg0, %arg1, 8, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
  // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
  nvvm.cp.async.shared.global %arg0, %arg1, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
  // CHECK: call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
  nvvm.cp.async.shared.global %arg0, %arg1, 16, cache = cg : !llvm.ptr<3>, !llvm.ptr<1>
  // CHECK: call void @llvm.nvvm.cp.async.commit.group()
  nvvm.cp.async.commit.group
  // CHECK: call void @llvm.nvvm.cp.async.wait.group(i32 0)
  nvvm.cp.async.wait.group 0
  llvm.return
}

// -----
// CHECK-LABEL: @cp_async_mbarrier_arrive
llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) {
  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %{{.*}})
  nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr
  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %{{.*}})
  nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr
  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %{{.*}})
  nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3>
  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %{{.*}})
  nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3>
  llvm.return
}

// -----
// CHECK-LABEL: @llvm_nvvm_setmaxregister
llvm.func @llvm_nvvm_setmaxregister() {
  // CHECK: call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 256)
  nvvm.setmaxregister increase 256
  // CHECK: call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 24)
  nvvm.setmaxregister decrease 24
  llvm.return
}

// -----
// CHECK-LABEL: @llvm_nvvm_cp_async_bulk_commit_group
llvm.func @llvm_nvvm_cp_async_bulk_commit_group() {
  // CHECK: call void @llvm.nvvm.cp.async.bulk.commit.group()
  nvvm.cp.async.bulk.commit.group
  llvm.return
}

// -----
// CHECK-LABEL: @llvm_nvvm_cp_async_bulk_wait_group
llvm.func @llvm_nvvm_cp_async_bulk_wait_group() {
  // CHECK: call void @llvm.nvvm.cp.async.bulk.wait.group(i32 0)
  nvvm.cp.async.bulk.wait_group 0
  // CHECK: call void @llvm.nvvm.cp.async.bulk.wait.group(i32 3)
  nvvm.cp.async.bulk.wait_group 3
  // CHECK: call void @llvm.nvvm.cp.async.bulk.wait.group.read(i32 0)
  nvvm.cp.async.bulk.wait_group 0 {read}
  // CHECK: call void @llvm.nvvm.cp.async.bulk.wait.group.read(i32 3)
  nvvm.cp.async.bulk.wait_group 3 {read}
  llvm.return
}

// -----
// CHECK-LABEL: @ld_matrix
llvm.func @ld_matrix(%arg0: !llvm.ptr<3>) {
  // CHECK: call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16.p3(ptr addrspace(3) %{{.*}})
  %l1 = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<3>) -> i32
  // CHECK: call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16.p3(ptr addrspace(3) %{{.*}})
  %l2 = nvvm.ldmatrix %arg0 {num = 2 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
  // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %{{.*}})
  %l4 = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
  // CHECK: call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.trans.b16.p3(ptr addrspace(3) %{{.*}})
  %l1t = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout<col>} : (!llvm.ptr<3>) -> i32
  // CHECK: call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.trans.b16.p3(ptr addrspace(3) %{{.*}})
  %l2t = nvvm.ldmatrix %arg0 {num = 2 : i32, layout = #nvvm.mma_layout<col>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
  // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %{{.*}})
  %l4t = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout<col>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
  llvm.return
}

// -----
// This function has the "kernel" attribute attached and should appear in the
// NVVM annotations after conversion.
llvm.func @kernel_func() attributes {nvvm.kernel} {
  llvm.return
}

// CHECK: !nvvm.annotations =
// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
// CHECK: {ptr @kernel_func, !"kernel", i32 1}

// -----
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 23, 32>} {
  llvm.return
}

// CHECK: !nvvm.annotations =
// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
// CHECK: {ptr @kernel_func, !"kernel", i32 1}
// CHECK: {ptr @kernel_func, !"maxntidx", i32 1}
// CHECK: {ptr @kernel_func, !"maxntidy", i32 23}
// CHECK: {ptr @kernel_func, !"maxntidz", i32 32}

// -----
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.reqntid = array<i32: 1, 23, 32>} {
  llvm.return
}

// CHECK: !nvvm.annotations =
// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
// CHECK: {ptr @kernel_func, !"kernel", i32 1}
// CHECK: {ptr @kernel_func, !"reqntidx", i32 1}
// CHECK: {ptr @kernel_func, !"reqntidy", i32 23}
// CHECK: {ptr @kernel_func, !"reqntidz", i32 32}

// -----
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_dim = array<i32: 3, 5, 7>} {
  llvm.return
}

// CHECK: !nvvm.annotations =
// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
// CHECK: {ptr @kernel_func, !"cluster_dim_x", i32 3}
// CHECK: {ptr @kernel_func, !"cluster_dim_y", i32 5}
// CHECK: {ptr @kernel_func, !"cluster_dim_z", i32 7}
// CHECK: {ptr @kernel_func, !"kernel", i32 1}

// -----
601 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} {
605 // CHECK: !nvvm.annotations =
606 // CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
607 // CHECK: {ptr @kernel_func, !"cluster_max_blocks", i32 8}
608 // CHECK: {ptr @kernel_func, !"kernel", i32 1}
611 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.minctasm = 16} {
615 // CHECK: !nvvm.annotations =
616 // CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
617 // CHECK: {ptr @kernel_func, !"kernel", i32 1}
618 // CHECK: {ptr @kernel_func, !"minctasm", i32 16}
621 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxnreg = 16} {
625 // CHECK: !nvvm.annotations =
626 // CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
627 // CHECK: {ptr @kernel_func, !"kernel", i32 1}
628 // CHECK: {ptr @kernel_func, !"maxnreg", i32 16}
631 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 23, 32>,
632 nvvm.minctasm = 16, nvvm.maxnreg = 32} {
636 // CHECK: !nvvm.annotations =
637 // CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
638 // CHECK: {ptr @kernel_func, !"kernel", i32 1}
639 // CHECK: {ptr @kernel_func, !"maxnreg", i32 32}
640 // CHECK: {ptr @kernel_func, !"maxntidx", i32 1}
641 // CHECK: {ptr @kernel_func, !"maxntidy", i32 23}
642 // CHECK: {ptr @kernel_func, !"maxntidz", i32 32}
643 // CHECK: {ptr @kernel_func, !"minctasm", i32 16}
646 // CHECK: !nvvm.annotations =
647 // CHECK: !1 = !{ptr @kernel_func, !"grid_constant", !2}
648 // CHECK: !2 = !{i32 1}
649 // CHECK: !3 = !{ptr @kernel_func, !"kernel", i32 1}
650 llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}) attributes {nvvm.kernel} {
655 // CHECK: !nvvm.annotations =
656 // CHECK: !1 = !{ptr @kernel_func, !"grid_constant", !2}
657 // CHECK: !2 = !{i32 1, i32 3}
658 // CHECK: !3 = !{ptr @kernel_func, !"kernel", i32 1}
659 llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
665 // CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
666 llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
667 %c128 = llvm.mlir.constant(128) : i32
668 // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
669 nvvm.fence.proxy.release #nvvm.mem_scope<cta>
671 // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
672 nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
674 // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
675 nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
677 // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
678 nvvm.fence.proxy.release #nvvm.mem_scope<sys>
683 // CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
684 llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
685 %c128 = llvm.mlir.constant(128) : i32
686 // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
687 nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
689 // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
690 nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
692 // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
693 nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
695 // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
696 nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
701 // CHECK-LABEL: @nvvm_exit
702 llvm.func @nvvm_exit() {
703 // CHECK: call void @llvm.nvvm.exit()
711 // CHECK-LABEL: @nvvm_breakpoint
712 llvm.func @nvvm_breakpoint() {
713 // CHECK: call void @llvm.debugtrap()
719 // CHECK-LABEL: @nvvm_wgmma_fence_aligned
720 llvm.func @nvvm_wgmma_fence_aligned() {
721 // CHECK: call void @llvm.nvvm.wgmma.fence.sync.aligned()
722 nvvm.wgmma.fence.aligned
727 // CHECK-LABEL: @nvvm_wgmma_commit_group_aligned
728 llvm.func @nvvm_wgmma_commit_group_aligned() {
729 // CHECK: call void @llvm.nvvm.wgmma.commit_group.sync.aligned()
730 nvvm.wgmma.commit.group.sync.aligned
735 // CHECK-LABEL: @nvvm_wgmma_wait_group_aligned
736 llvm.func @nvvm_wgmma_wait_group_aligned() {
737 // CHECK: call void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64 0)
738 nvvm.wgmma.wait.group.sync.aligned 0
739 // CHECK: call void @llvm.nvvm.wgmma.wait_group.sync.aligned(i64 20)
740 nvvm.wgmma.wait.group.sync.aligned 20