1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
; Declarations of the six WMMA intrinsics called by the tests in this file.
; The f32-result variants take only the A, B and C operands; the f16/bf16-result
; variants add a trailing i1 immarg; the iu8/iu4 variants take an i1 immarg
; before each of A and B plus a trailing i1 immarg.
4 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
5 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
6 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
7 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
8 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
9 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
11 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
; Calls the f32.16x16x16.f16 WMMA intrinsic once on %A, %B and %C and stores the
; <8 x float> result to %out. The expected output is a single
; v_wmma_f32_16x16x16_f16 whose destination v[16:23] is also its C operand,
; followed by two global_store_b128 grouped by s_clause 0x1.
13 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
14 ; W32-LABEL: test_wmma_f32_16x16x16_f16:
16 ; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
17 ; W32-NEXT: s_clause 0x1
18 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
19 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
21 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
24 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
25 store <8 x float> %res, ptr addrspace(1) %out, align 32
29 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
; Calls the f32.16x16x16.bf16 WMMA intrinsic once on %A, %B (both <16 x i16>)
; and %C, then stores the <8 x float> result to %out. The expected output is a
; single v_wmma_f32_16x16x16_bf16 whose destination v[16:23] is also its C
; operand, followed by two global_store_b128 grouped by s_clause 0x1.
31 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
32 ; W32-LABEL: test_wmma_f32_16x16x16_bf16:
34 ; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
35 ; W32-NEXT: s_clause 0x1
36 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
37 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
39 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
42 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
43 store <8 x float> %res, ptr addrspace(1) %out, align 32
47 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
; Calls the f16.16x16x16.f16 WMMA intrinsic with the trailing i1 immarg set to 0
; and stores the <16 x half> result to %out. The expected
; v_wmma_f16_16x16x16_f16 carries no op_sel modifier.
49 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) {
50 ; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
52 ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
53 ; W32-NEXT: s_clause 0x1
54 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
55 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
57 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
60 %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
61 store <16 x half> %res, ptr addrspace(1) %out, align 32
; Calls the f16.16x16x16.f16 WMMA intrinsic with the trailing i1 immarg set to 1
; and stores the <16 x half> result to %out. The expected
; v_wmma_f16_16x16x16_f16 carries op_sel:[0,0,1].
65 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) {
66 ; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
68 ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
69 ; W32-NEXT: s_clause 0x1
70 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
71 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
73 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
76 %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
77 store <16 x half> %res, ptr addrspace(1) %out, align 32
81 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
; Calls the bf16.16x16x16.bf16 WMMA intrinsic with the trailing i1 immarg set to
; 0 and stores the <16 x i16> result to %out. The expected
; v_wmma_bf16_16x16x16_bf16 carries no op_sel modifier.
83 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
84 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
86 ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
87 ; W32-NEXT: s_clause 0x1
88 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
89 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
91 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
94 %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
95 store <16 x i16> %res, ptr addrspace(1) %out, align 32
; Calls the bf16.16x16x16.bf16 WMMA intrinsic with the trailing i1 immarg set to
; 1 and stores the <16 x i16> result to %out. The expected
; v_wmma_bf16_16x16x16_bf16 carries op_sel:[0,0,1].
99 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
100 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
101 ; W32: ; %bb.0: ; %bb
102 ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
103 ; W32-NEXT: s_clause 0x1
104 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
105 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
107 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
110 %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
111 store <16 x i16> %res, ptr addrspace(1) %out, align 32
115 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8
; iu8 WMMA with immargs (before A: i1 0, before B: i1 0, trailing: i1 0); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu8 has
; neither a neg_lo nor a clamp modifier.
; NOTE(review): the function name spells "ui8" while the intrinsic and the
; instruction spell "iu8"; the name is matched by the label check, so it is
; kept as is.
117 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
118 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
119 ; W32: ; %bb.0: ; %bb
120 ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15]
121 ; W32-NEXT: s_clause 0x1
122 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
123 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
125 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
128 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
129 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu8 WMMA with immargs (before A: i1 0, before B: i1 1, trailing: i1 0); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu8
; carries neg_lo:[0,1,0] and no clamp.
133 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
134 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
135 ; W32: ; %bb.0: ; %bb
136 ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
137 ; W32-NEXT: s_clause 0x1
138 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
139 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
141 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
144 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
145 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu8 WMMA with immargs (before A: i1 1, before B: i1 0, trailing: i1 0); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu8
; carries neg_lo:[1,0,0] and no clamp.
149 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
150 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
151 ; W32: ; %bb.0: ; %bb
152 ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
153 ; W32-NEXT: s_clause 0x1
154 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
155 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
157 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
160 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
161 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu8 WMMA with immargs (before A: i1 1, before B: i1 1, trailing: i1 0); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu8
; carries neg_lo:[1,1,0] and no clamp.
165 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
166 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
167 ; W32: ; %bb.0: ; %bb
168 ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
169 ; W32-NEXT: s_clause 0x1
170 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
171 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
173 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
176 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
177 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu8 WMMA with immargs (before A: i1 0, before B: i1 0, trailing: i1 1); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu8
; carries clamp and no neg_lo modifier.
181 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
182 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
183 ; W32: ; %bb.0: ; %bb
184 ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp
185 ; W32-NEXT: s_clause 0x1
186 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
187 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
189 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
192 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
193 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu8 WMMA with immargs (before A: i1 0, before B: i1 1, trailing: i1 1); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu8
; carries neg_lo:[0,1,0] and clamp.
197 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
198 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
199 ; W32: ; %bb.0: ; %bb
200 ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
201 ; W32-NEXT: s_clause 0x1
202 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
203 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
205 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
208 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
209 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu8 WMMA with immargs (before A: i1 1, before B: i1 0, trailing: i1 1); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu8
; carries neg_lo:[1,0,0] and clamp.
213 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
214 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
215 ; W32: ; %bb.0: ; %bb
216 ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
217 ; W32-NEXT: s_clause 0x1
218 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
219 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
221 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
224 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
225 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu8 WMMA with immargs (before A: i1 1, before B: i1 1, trailing: i1 1); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu8
; carries neg_lo:[1,1,0] and clamp.
229 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
230 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
231 ; W32: ; %bb.0: ; %bb
232 ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
233 ; W32-NEXT: s_clause 0x1
234 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
235 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
237 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
240 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
241 store <8 x i32> %res, ptr addrspace(1) %out, align 32
245 ; @llvm.amdgcn.wmma.i32.16x16x16.iu4
; iu4 WMMA with immargs (before A: i1 0, before B: i1 0, trailing: i1 0); A and
; B are <2 x i32> and the <8 x i32> result is stored to %out. The expected
; v_wmma_i32_16x16x16_iu4 has neither a neg_lo nor a clamp modifier.
; NOTE(review): the function name spells "ui4" while the intrinsic and the
; instruction spell "iu4"; the name is matched by the label check, so it is
; kept as is.
247 define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
248 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
249 ; W32: ; %bb.0: ; %bb
250 ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
251 ; W32-NEXT: s_clause 0x1
252 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
253 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
255 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
258 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
259 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu4 WMMA with immargs (before A: i1 0, before B: i1 1, trailing: i1 0); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu4
; carries neg_lo:[0,1,0] and no clamp.
263 define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
264 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
265 ; W32: ; %bb.0: ; %bb
266 ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
267 ; W32-NEXT: s_clause 0x1
268 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
269 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
271 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
274 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
275 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu4 WMMA with immargs (before A: i1 1, before B: i1 0, trailing: i1 0); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu4
; carries neg_lo:[1,0,0] and no clamp.
279 define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
280 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
281 ; W32: ; %bb.0: ; %bb
282 ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
283 ; W32-NEXT: s_clause 0x1
284 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
285 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
287 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
290 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
291 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu4 WMMA with immargs (before A: i1 1, before B: i1 1, trailing: i1 0); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu4
; carries neg_lo:[1,1,0] and no clamp.
295 define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
296 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
297 ; W32: ; %bb.0: ; %bb
298 ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
299 ; W32-NEXT: s_clause 0x1
300 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
301 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
303 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
306 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
307 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu4 WMMA with immargs (before A: i1 0, before B: i1 0, trailing: i1 1); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu4
; carries clamp and no neg_lo modifier.
312 define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
313 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
314 ; W32: ; %bb.0: ; %bb
315 ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
316 ; W32-NEXT: s_clause 0x1
317 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
318 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
320 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
323 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
324 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu4 WMMA with immargs (before A: i1 0, before B: i1 1, trailing: i1 1); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu4
; carries neg_lo:[0,1,0] and clamp.
328 define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
329 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
330 ; W32: ; %bb.0: ; %bb
331 ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
332 ; W32-NEXT: s_clause 0x1
333 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
334 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
336 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
339 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
340 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu4 WMMA with immargs (before A: i1 1, before B: i1 0, trailing: i1 1); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu4
; carries neg_lo:[1,0,0] and clamp.
344 define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
345 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
346 ; W32: ; %bb.0: ; %bb
347 ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
348 ; W32-NEXT: s_clause 0x1
349 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
350 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
352 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
355 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
356 store <8 x i32> %res, ptr addrspace(1) %out, align 32
; iu4 WMMA with immargs (before A: i1 1, before B: i1 1, trailing: i1 1); the
; <8 x i32> result is stored to %out. The expected v_wmma_i32_16x16x16_iu4
; carries neg_lo:[1,1,0] and clamp.
360 define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
361 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
362 ; W32: ; %bb.0: ; %bb
363 ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
364 ; W32-NEXT: s_clause 0x1
365 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
366 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
368 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
371 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
372 store <8 x i32> %res, ptr addrspace(1) %out, align 32