; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
; Checks that llvm.amdgcn.wmma.f32.16x16x16.f16 selects to a single
; v_wmma_f32_16x16x16_f16 and that the <8 x float> result is stored to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.f32.16x16x16.bf16 (A/B passed as <8 x i16>)
; selects to v_wmma_f32_16x16x16_bf16 and that the result is stored to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.f16.16x16x16.f16 (trailing i1 immediate = 0)
; selects to v_wmma_f16_16x16x16_f16; the <8 x half> result needs one b128 store.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.bf16.16x16x16.bf16 (trailing i1 immediate = 0)
; selects to v_wmma_bf16_16x16x16_bf16; the <8 x i16> result needs one b128 store.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.i32.16x16x16.iu8 with all i1 immediates set to 0
; selects to v_wmma_i32_16x16x16_iu8 with no modifiers printed.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.i32.16x16x16.iu4 (scalar i32 A/B, all i1
; immediates 0) selects to v_wmma_i32_16x16x16_iu4 with single-VGPR A/B operands.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8 (A/B passed as <2 x i32>)
; selects to v_wmma_f32_16x16x16_fp8_fp8 and that the result is stored to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8 (A/B passed as <2 x i32>)
; selects to v_wmma_f32_16x16x16_bf8_fp8 and that the result is stored to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8 (A/B passed as <2 x i32>)
; selects to v_wmma_f32_16x16x16_fp8_bf8 and that the result is stored to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8 (A/B passed as <2 x i32>)
; selects to v_wmma_f32_16x16x16_bf8_bf8 and that the result is stored to %out.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.wmma.i32.16x16x32.iu4 (<2 x i32> A/B, all i1
; immediates 0) selects to v_wmma_i32_16x16x32_iu4 with no modifiers printed.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.f32.16x16x32.f16 (<8 x half> A, <16 x half> B,
; i16 %Index) selects to v_swmmac_f32_16x16x32_f16 with the index in one VGPR.
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.f32.16x16x32.bf16 (<8 x i16> A, <16 x i16> B,
; i16 %Index) selects to v_swmmac_f32_16x16x32_bf16 with the index in one VGPR.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.f16.16x16x32.f16 selects to
; v_swmmac_f16_16x16x32_f16; the <8 x half> result needs one b128 store.
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.bf16.16x16x32.bf16 selects to
; v_swmmac_bf16_16x16x32_bf16; the <8 x i16> result needs one b128 store.
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
  store <8 x i16> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.i32.16x16x32.iu8 (<2 x i32> A, <4 x i32> B,
; i16 %Index, all i1 immediates 0) selects to v_swmmac_i32_16x16x32_iu8.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.i32.16x16x32.iu4 (i32 A, <2 x i32> B,
; i16 %Index, all i1 immediates 0) selects to v_swmmac_i32_16x16x32_iu4.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.i32.16x16x64.iu4 selects to
; v_swmmac_i32_16x16x64_iu4. Unlike the other SWMMAC tests in this file,
; %Index is an i32 here (intrinsic suffix .i32), with all i1 immediates 0.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8 (<2 x i32> A, <4 x i32> B,
; i16 %Index) selects to v_swmmac_f32_16x16x32_fp8_fp8.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8 (<2 x i32> A, <4 x i32> B,
; i16 %Index) selects to v_swmmac_f32_16x16x32_fp8_bf8.
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8 (<2 x i32> A, <4 x i32> B,
; i16 %Index) selects to v_swmmac_f32_16x16x32_bf8_fp8.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Checks that llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8 (<2 x i32> A, <4 x i32> B,
; i16 %Index) selects to v_swmmac_f32_16x16x32_bf8_bf8.
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}
; Declarations of the WMMA intrinsics called by the tests in this file.
; Operands marked immarg must be compile-time constants at every call site.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
; Declarations of the SWMMAC intrinsics; the operand after the <8 x ...> C
; operand is the index, i16 for all variants except 16x16x64.iu4, which uses i32.
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)