1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
; Test: wmma.i32.16x16x16.iu8 called with only the first i1 immediate (the one
; preceding %A) set to 1; the second i1 and the trailing i1 are 0.
; The checks expect this to be selected as v_wmma_i32_16x16x16_iu8 with
; neg_lo:[1,0,0], followed by a 128-bit global store of the result to %out.
; NOTE(review): later tests in this file (e.g.
; test_wmma_i32_16x16x32_iu4_zext_src1) also check an `s_nop 0` between the
; store and s_sendmsg; no such check is present here - confirm by regenerating
; the assertions with utils/update_llc_test_checks.py.
4 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
5 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
6 ; GFX12: ; %bb.0: ; %bb
7 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
8 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
10 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11 ; GFX12-NEXT: s_endpgm
13 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
14 store <4 x i32> %res, ptr addrspace(1) %out
; Test: wmma.i32.16x16x16.iu8 called with only the second i1 immediate (the one
; preceding %B) set to 1; the first i1 and the trailing i1 are 0.
; The checks expect this to be selected as v_wmma_i32_16x16x16_iu8 with
; neg_lo:[0,1,0], followed by a 128-bit global store of the result to %out.
; NOTE(review): later tests in this file (e.g.
; test_wmma_i32_16x16x32_iu4_zext_src1) also check an `s_nop 0` between the
; store and s_sendmsg; no such check is present here - confirm by regenerating
; the assertions with utils/update_llc_test_checks.py.
18 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
19 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
20 ; GFX12: ; %bb.0: ; %bb
21 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
22 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
24 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
25 ; GFX12-NEXT: s_endpgm
27 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
28 store <4 x i32> %res, ptr addrspace(1) %out
; Test: wmma.i32.16x16x16.iu8 called with only the trailing i1 immediate set to
; 1; both i1 immediates preceding %A and %B are 0.
; The checks expect this to be selected as v_wmma_i32_16x16x16_iu8 with the
; `clamp` modifier and no neg_lo, followed by a 128-bit global store to %out.
; NOTE(review): later tests in this file (e.g.
; test_wmma_i32_16x16x32_iu4_clamp) also check an `s_nop 0` between the store
; and s_sendmsg; no such check is present here - confirm by regenerating the
; assertions with utils/update_llc_test_checks.py.
32 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
33 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
34 ; GFX12: ; %bb.0: ; %bb
35 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
36 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
38 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
39 ; GFX12-NEXT: s_endpgm
41 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
42 store <4 x i32> %res, ptr addrspace(1) %out
; Test: wmma.i32.16x16x16.iu4 called with only the first i1 immediate (the one
; preceding %A) set to 1; the second i1 and the trailing i1 are 0.
; The checks expect this to be selected as v_wmma_i32_16x16x16_iu4 with
; neg_lo:[1,0,0], followed by a 128-bit global store of the result to %out.
; NOTE(review): later tests in this file (e.g.
; test_wmma_i32_16x16x32_iu4_zext_src1) also check an `s_nop 0` between the
; store and s_sendmsg; no such check is present here - confirm by regenerating
; the assertions with utils/update_llc_test_checks.py.
48 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
49 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
50 ; GFX12: ; %bb.0: ; %bb
51 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
52 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
54 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
55 ; GFX12-NEXT: s_endpgm
57 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
58 store <4 x i32> %res, ptr addrspace(1) %out
; Test: wmma.i32.16x16x16.iu4 called with only the second i1 immediate (the one
; preceding %B) set to 1; the first i1 and the trailing i1 are 0.
; The checks expect this to be selected as v_wmma_i32_16x16x16_iu4 with
; neg_lo:[0,1,0], followed by a 128-bit global store of the result to %out.
; NOTE(review): later tests in this file (e.g.
; test_wmma_i32_16x16x32_iu4_zext_src1) also check an `s_nop 0` between the
; store and s_sendmsg; no such check is present here - confirm by regenerating
; the assertions with utils/update_llc_test_checks.py.
62 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
63 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
64 ; GFX12: ; %bb.0: ; %bb
65 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
66 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
68 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
69 ; GFX12-NEXT: s_endpgm
71 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
72 store <4 x i32> %res, ptr addrspace(1) %out
; Test: wmma.i32.16x16x16.iu4 called with only the trailing i1 immediate set to
; 1; both i1 immediates preceding %A and %B are 0.
; The checks expect this to be selected as v_wmma_i32_16x16x16_iu4 with the
; `clamp` modifier and no neg_lo, followed by a 128-bit global store to %out.
; NOTE(review): later tests in this file (e.g.
; test_wmma_i32_16x16x32_iu4_clamp) also check an `s_nop 0` between the store
; and s_sendmsg; no such check is present here - confirm by regenerating the
; assertions with utils/update_llc_test_checks.py.
76 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
77 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
78 ; GFX12: ; %bb.0: ; %bb
79 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
80 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
82 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
83 ; GFX12-NEXT: s_endpgm
85 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
86 store <4 x i32> %res, ptr addrspace(1) %out
; Test: wmma.i32.16x16x32.iu4 called with only the first i1 immediate (the one
; preceding %A) set to 1; the second i1 and the trailing i1 are 0.
; The checks expect this to be selected as v_wmma_i32_16x16x32_iu4 with
; neg_lo:[1,0,0], followed by a 128-bit global store of the result to %out.
; NOTE(review): the sibling tests test_wmma_i32_16x16x32_iu4_zext_src1 and
; test_wmma_i32_16x16x32_iu4_clamp also check an `s_nop 0` between the store
; and s_sendmsg; no such check is present here - confirm by regenerating the
; assertions with utils/update_llc_test_checks.py.
92 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
93 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
94 ; GFX12: ; %bb.0: ; %bb
95 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
96 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
98 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
99 ; GFX12-NEXT: s_endpgm
101 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
102 store <4 x i32> %res, ptr addrspace(1) %out
; Test: wmma.i32.16x16x32.iu4 called with only the second i1 immediate (the one
; preceding %B) set to 1; the first i1 and the trailing i1 are 0.
; The checks expect this to be selected as v_wmma_i32_16x16x32_iu4 with
; neg_lo:[0,1,0], followed by a 128-bit global store of the result to %out and
; the s_nop / s_sendmsg / s_endpgm epilogue.
106 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
107 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
108 ; GFX12: ; %bb.0: ; %bb
109 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
110 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
111 ; GFX12-NEXT: s_nop 0
112 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
113 ; GFX12-NEXT: s_endpgm
115 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
116 store <4 x i32> %res, ptr addrspace(1) %out
; Test: wmma.i32.16x16x32.iu4 called with only the trailing i1 immediate set to
; 1; both i1 immediates preceding %A and %B are 0.
; The checks expect this to be selected as v_wmma_i32_16x16x32_iu4 with the
; `clamp` modifier and no neg_lo, followed by a 128-bit global store to %out
; and the s_nop / s_sendmsg / s_endpgm epilogue.
120 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
121 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
122 ; GFX12: ; %bb.0: ; %bb
123 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
124 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
125 ; GFX12-NEXT: s_nop 0
126 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
127 ; GFX12-NEXT: s_endpgm
129 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
130 store <4 x i32> %res, ptr addrspace(1) %out
; Test: swmmac.i32.16x16x32.iu8 (operands: i32 %A, <2 x i32> %B, <4 x i32> %C,
; i8 %Index) called with only the first i1 immediate (the one preceding %A)
; set to 1; the second i1 and the trailing i1 are 0.
; The checks expect v_swmmac_i32_16x16x32_iu8 with neg_lo:[1,0,0], writing
; v[3:6] and taking v7 as its last register operand, then a 128-bit global
; store to %out.
139 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
140 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
141 ; GFX12: ; %bb.0: ; %bb
142 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
143 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
144 ; GFX12-NEXT: s_nop 0
145 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
146 ; GFX12-NEXT: s_endpgm
148 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
149 store <4 x i32> %res, ptr addrspace(1) %out
; Test: swmmac.i32.16x16x32.iu8 (operands: i32 %A, <2 x i32> %B, <4 x i32> %C,
; i8 %Index) called with only the second i1 immediate (the one preceding %B)
; set to 1; the first i1 and the trailing i1 are 0.
; The checks expect v_swmmac_i32_16x16x32_iu8 with neg_lo:[0,1,0], writing
; v[3:6] and taking v7 as its last register operand, then a 128-bit global
; store to %out.
153 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
154 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
155 ; GFX12: ; %bb.0: ; %bb
156 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
157 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
158 ; GFX12-NEXT: s_nop 0
159 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
160 ; GFX12-NEXT: s_endpgm
162 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
163 store <4 x i32> %res, ptr addrspace(1) %out
; Test: swmmac.i32.16x16x32.iu8 (operands: i32 %A, <2 x i32> %B, <4 x i32> %C,
; i8 %Index) called with only the trailing i1 immediate set to 1; both i1
; immediates preceding %A and %B are 0.
; The checks expect v_swmmac_i32_16x16x32_iu8 with the `clamp` modifier and no
; neg_lo, writing v[3:6] and taking v7 as its last register operand, then a
; 128-bit global store to %out.
167 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
168 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
169 ; GFX12: ; %bb.0: ; %bb
170 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
171 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
172 ; GFX12-NEXT: s_nop 0
173 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
174 ; GFX12-NEXT: s_endpgm
176 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
177 store <4 x i32> %res, ptr addrspace(1) %out
; Test: swmmac.i32.16x16x32.iu4 (operands: i32 %A, i32 %B, <4 x i32> %C,
; i16 %Index) called with only the first i1 immediate (the one preceding %A)
; set to 1; the second i1 and the trailing i1 are 0.
; The checks expect v_swmmac_i32_16x16x32_iu4 with neg_lo:[1,0,0], writing
; v[2:5] and taking v6 as its last register operand, then a 128-bit global
; store to %out.
183 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
184 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
185 ; GFX12: ; %bb.0: ; %bb
186 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
187 ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
188 ; GFX12-NEXT: s_nop 0
189 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
190 ; GFX12-NEXT: s_endpgm
192 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
193 store <4 x i32> %res, ptr addrspace(1) %out
; Test: swmmac.i32.16x16x32.iu4 (operands: i32 %A, i32 %B, <4 x i32> %C,
; i16 %Index) called with only the second i1 immediate (the one preceding %B)
; set to 1; the first i1 and the trailing i1 are 0.
; The checks expect v_swmmac_i32_16x16x32_iu4 with neg_lo:[0,1,0], writing
; v[2:5] and taking v6 as its last register operand, then a 128-bit global
; store to %out.
197 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
198 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
199 ; GFX12: ; %bb.0: ; %bb
200 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
201 ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
202 ; GFX12-NEXT: s_nop 0
203 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
204 ; GFX12-NEXT: s_endpgm
206 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
207 store <4 x i32> %res, ptr addrspace(1) %out
; Test: swmmac.i32.16x16x32.iu4 (operands: i32 %A, i32 %B, <4 x i32> %C,
; i16 %Index) called with only the trailing i1 immediate set to 1; both i1
; immediates preceding %A and %B are 0.
; The checks expect v_swmmac_i32_16x16x32_iu4 with the `clamp` modifier and no
; neg_lo, writing v[2:5] and taking v6 as its last register operand, then a
; 128-bit global store to %out.
211 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
212 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
213 ; GFX12: ; %bb.0: ; %bb
214 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
215 ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
216 ; GFX12-NEXT: s_nop 0
217 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
218 ; GFX12-NEXT: s_endpgm
220 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
221 store <4 x i32> %res, ptr addrspace(1) %out
; Test: swmmac.i32.16x16x64.iu4 (operands: i32 %A, <2 x i32> %B, <4 x i32> %C,
; i16 %Index) called with only the first i1 immediate (the one preceding %A)
; set to 1; the second i1 and the trailing i1 are 0.
; The checks expect v_swmmac_i32_16x16x64_iu4 with neg_lo:[1,0,0], writing
; v[3:6] and taking v7 as its last register operand, then a 128-bit global
; store to %out.
227 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
228 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
229 ; GFX12: ; %bb.0: ; %bb
230 ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
231 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
232 ; GFX12-NEXT: s_nop 0
233 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
234 ; GFX12-NEXT: s_endpgm
236 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
237 store <4 x i32> %res, ptr addrspace(1) %out
; Test: swmmac.i32.16x16x64.iu4 (operands: i32 %A, <2 x i32> %B, <4 x i32> %C,
; i16 %Index) called with only the second i1 immediate (the one preceding %B)
; set to 1; the first i1 and the trailing i1 are 0.
; The checks expect v_swmmac_i32_16x16x64_iu4 with neg_lo:[0,1,0], writing
; v[3:6] and taking v7 as its last register operand, then a 128-bit global
; store to %out.
241 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
242 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
243 ; GFX12: ; %bb.0: ; %bb
244 ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
245 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
246 ; GFX12-NEXT: s_nop 0
247 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
248 ; GFX12-NEXT: s_endpgm
250 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
251 store <4 x i32> %res, ptr addrspace(1) %out
; Test: swmmac.i32.16x16x64.iu4 (operands: i32 %A, <2 x i32> %B, <4 x i32> %C,
; i16 %Index) called with only the trailing i1 immediate set to 1; both i1
; immediates preceding %A and %B are 0.
; The checks expect v_swmmac_i32_16x16x64_iu4 with the `clamp` modifier and no
; neg_lo, writing v[3:6] and taking v7 as its last register operand, then a
; 128-bit global store to %out.
255 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
256 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
257 ; GFX12: ; %bb.0: ; %bb
258 ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
259 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
260 ; GFX12-NEXT: s_nop 0
261 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
262 ; GFX12-NEXT: s_endpgm
264 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
265 store <4 x i32> %res, ptr addrspace(1) %out
; Declarations of the intrinsics called by the tests in this file.
; Every i1 parameter is marked immarg, so call sites must pass constants.
; The three wmma variants take (i1, i32, i1, i32, <4 x i32>, i1); the three
; swmmac variants additionally take an %Index operand (i8 or i16) before the
; trailing i1, and two of them take a <2 x i32> second data operand.
269 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
270 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
271 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
272 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
273 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
274 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)