; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
; WMMA i32 16x16x16 iu8: the first i1 immediate (the one preceding %A) is 1,
; the other two i1 immediates are 0. The expected instruction carries
; neg_lo:[1,0,0] and the <8 x i32> result is stored to %out as two b128 stores.
; NOTE(review): the `s_nop 0` check before s_sendmsg mirrors the other tests in
; this file; re-run utils/update_llc_test_checks.py to confirm the exact output.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; WMMA i32 16x16x16 iu8: the second i1 immediate (the one preceding %B) is 1,
; the other two i1 immediates are 0. The expected instruction carries
; neg_lo:[0,1,0] and the <8 x i32> result is stored to %out as two b128 stores.
; NOTE(review): the `s_nop 0` check before s_sendmsg mirrors the other tests in
; this file; re-run utils/update_llc_test_checks.py to confirm the exact output.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; WMMA i32 16x16x16 iu8: only the trailing i1 immediate is 1. The expected
; instruction carries the `clamp` modifier and no neg_lo; the <8 x i32> result
; is stored to %out as two b128 stores.
; NOTE(review): the `s_nop 0` check before s_sendmsg mirrors the other tests in
; this file; re-run utils/update_llc_test_checks.py to confirm the exact output.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; WMMA i32 16x16x16 iu4 (scalar i32 A/B operands): the first i1 immediate (the
; one preceding %A) is 1, the others are 0. The expected instruction carries
; neg_lo:[1,0,0] and the <8 x i32> result is stored to %out as two b128 stores.
; NOTE(review): the `s_nop 0` check before s_sendmsg mirrors the other tests in
; this file; re-run utils/update_llc_test_checks.py to confirm the exact output.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; WMMA i32 16x16x16 iu4 (scalar i32 A/B operands): the second i1 immediate (the
; one preceding %B) is 1, the others are 0. The expected instruction carries
; neg_lo:[0,1,0] and the <8 x i32> result is stored to %out as two b128 stores.
; NOTE(review): the `s_nop 0` check before s_sendmsg mirrors the other tests in
; this file; re-run utils/update_llc_test_checks.py to confirm the exact output.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; WMMA i32 16x16x16 iu4 (scalar i32 A/B operands): only the trailing i1
; immediate is 1. The expected instruction carries the `clamp` modifier and no
; neg_lo; the <8 x i32> result is stored to %out as two b128 stores.
; NOTE(review): the `s_nop 0` check before s_sendmsg mirrors the other tests in
; this file; re-run utils/update_llc_test_checks.py to confirm the exact output.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; WMMA i32 16x16x32 iu4: the first i1 immediate (the one preceding %A) is 1,
; the other two i1 immediates are 0. The expected instruction carries
; neg_lo:[1,0,0] and the <8 x i32> result is stored to %out as two b128 stores.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; WMMA i32 16x16x32 iu4: the second i1 immediate (the one preceding %B) is 1,
; the other two i1 immediates are 0. The expected instruction carries
; neg_lo:[0,1,0] and the <8 x i32> result is stored to %out as two b128 stores.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; WMMA i32 16x16x32 iu4: only the trailing i1 immediate is 1. The expected
; instruction carries the `clamp` modifier and no neg_lo; the <8 x i32> result
; is stored to %out as two b128 stores.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; SWMMAC i32 16x16x32 iu8 with an i16 %Index operand: the first i1 immediate
; (the one preceding %A) is 1, the others are 0. The expected instruction
; carries neg_lo:[1,0,0]; the <8 x i32> result is stored to %out as two b128
; stores.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; SWMMAC i32 16x16x32 iu8 with an i16 %Index operand: the second i1 immediate
; (the one preceding %B) is 1, the others are 0. The expected instruction
; carries neg_lo:[0,1,0]; the <8 x i32> result is stored to %out as two b128
; stores.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; SWMMAC i32 16x16x32 iu8 with an i16 %Index operand: only the trailing i1
; immediate is 1. The expected instruction carries the `clamp` modifier and no
; neg_lo; the <8 x i32> result is stored to %out as two b128 stores.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; SWMMAC i32 16x16x32 iu4 (i32 %A, <2 x i32> %B, i16 %Index): the first i1
; immediate (the one preceding %A) is 1, the others are 0. The expected
; instruction carries neg_lo:[1,0,0]; the <8 x i32> result is stored to %out as
; two b128 stores.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; SWMMAC i32 16x16x32 iu4 (i32 %A, <2 x i32> %B, i16 %Index): the second i1
; immediate (the one preceding %B) is 1, the others are 0. The expected
; instruction carries neg_lo:[0,1,0]; the <8 x i32> result is stored to %out as
; two b128 stores.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; SWMMAC i32 16x16x32 iu4 (i32 %A, <2 x i32> %B, i16 %Index): only the trailing
; i1 immediate is 1. The expected instruction carries the `clamp` modifier and
; no neg_lo; the <8 x i32> result is stored to %out as two b128 stores.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; SWMMAC i32 16x16x64 iu4 with an i32 %Index operand: the first i1 immediate
; (the one preceding %A) is 1, the others are 0. The expected instruction
; carries neg_lo:[1,0,0]; the <8 x i32> result is stored to %out as two b128
; stores.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; SWMMAC i32 16x16x64 iu4 with an i32 %Index operand: the second i1 immediate
; (the one preceding %B) is 1, the others are 0. The expected instruction
; carries neg_lo:[0,1,0]; the <8 x i32> result is stored to %out as two b128
; stores.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; SWMMAC i32 16x16x64 iu4 with an i32 %Index operand: only the trailing i1
; immediate is 1. The expected instruction carries the `clamp` modifier and no
; neg_lo; the <8 x i32> result is stored to %out as two b128 stores.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}
; Declarations of the WMMA / SWMMAC integer intrinsics called by the tests in
; this file. The i1 operands are marked immarg, so every call site must pass
; them as constants. The SWMMAC variants take an additional index operand
; (i16 or i32, matching the last component of the mangled name).
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)