1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
; wmma i32 16x16x16 iu8 with the i1 immediate that precedes %A set to 1 and the
; other two i1 immediates set to 0. The checks expect neg_lo:[1,0,0] on the
; selected instruction, the 128-bit store of the result to %out, and - like
; every sibling test in this file - the terminating s_endpgm.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
5 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
6 ; GFX12: ; %bb.0: ; %bb
7 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
8 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
9 ; GFX12-NEXT: s_endpgm
11 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
12 store <4 x i32> %res, ptr addrspace(1) %out
; wmma i32 16x16x16 iu8 with only the i1 immediate that precedes %B set to 1.
; The checks expect neg_lo:[0,1,0] on the selected instruction, followed by the
; 128-bit store of the result to %out and s_endpgm.
16 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
17 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
18 ; GFX12: ; %bb.0: ; %bb
19 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
20 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
21 ; GFX12-NEXT: s_endpgm
23 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
24 store <4 x i32> %res, ptr addrspace(1) %out
; wmma i32 16x16x16 iu8 with only the trailing i1 immediate set to 1.
; The checks expect the clamp modifier (and no neg_lo) on the selected
; instruction, followed by the 128-bit store of the result to %out and s_endpgm.
28 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
29 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
30 ; GFX12: ; %bb.0: ; %bb
31 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
32 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
33 ; GFX12-NEXT: s_endpgm
35 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
36 store <4 x i32> %res, ptr addrspace(1) %out
; wmma i32 16x16x16 iu4 with only the i1 immediate that precedes %A set to 1.
; The checks expect neg_lo:[1,0,0] on the selected instruction, followed by the
; 128-bit store of the result to %out and s_endpgm.
42 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
43 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
44 ; GFX12: ; %bb.0: ; %bb
45 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
46 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
47 ; GFX12-NEXT: s_endpgm
49 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
50 store <4 x i32> %res, ptr addrspace(1) %out
; wmma i32 16x16x16 iu4 with only the i1 immediate that precedes %B set to 1.
; The checks expect neg_lo:[0,1,0] on the selected instruction, followed by the
; 128-bit store of the result to %out and s_endpgm.
54 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
55 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
56 ; GFX12: ; %bb.0: ; %bb
57 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
58 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
59 ; GFX12-NEXT: s_endpgm
61 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
62 store <4 x i32> %res, ptr addrspace(1) %out
; wmma i32 16x16x16 iu4 with only the trailing i1 immediate set to 1.
; The checks expect the clamp modifier (and no neg_lo) on the selected
; instruction, followed by the 128-bit store of the result to %out and s_endpgm.
66 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
67 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
68 ; GFX12: ; %bb.0: ; %bb
69 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
70 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
71 ; GFX12-NEXT: s_endpgm
73 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
74 store <4 x i32> %res, ptr addrspace(1) %out
; wmma i32 16x16x32 iu4 with only the i1 immediate that precedes %A set to 1.
; The checks expect neg_lo:[1,0,0] on the selected instruction, followed by the
; 128-bit store of the result to %out and s_endpgm.
80 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
81 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
82 ; GFX12: ; %bb.0: ; %bb
83 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
84 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
85 ; GFX12-NEXT: s_endpgm
87 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
88 store <4 x i32> %res, ptr addrspace(1) %out
; wmma i32 16x16x32 iu4 with only the i1 immediate that precedes %B set to 1.
; The checks expect neg_lo:[0,1,0] on the selected instruction, followed by the
; 128-bit store of the result to %out and s_endpgm.
92 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
93 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
94 ; GFX12: ; %bb.0: ; %bb
95 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
96 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
97 ; GFX12-NEXT: s_endpgm
99 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
100 store <4 x i32> %res, ptr addrspace(1) %out
; wmma i32 16x16x32 iu4 with only the trailing i1 immediate set to 1.
; The checks expect the clamp modifier (and no neg_lo) on the selected
; instruction, followed by the 128-bit store of the result to %out and s_endpgm.
104 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
105 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
106 ; GFX12: ; %bb.0: ; %bb
107 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
108 ; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
109 ; GFX12-NEXT: s_endpgm
111 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
112 store <4 x i32> %res, ptr addrspace(1) %out
; swmmac i32 16x16x32 iu8 (i32 %A, <2 x i32> %B, i8 %Index) with only the i1
; immediate that precedes %A set to 1. The checks expect neg_lo:[1,0,0] on the
; selected instruction, followed by the 128-bit store of the result to %out
; and s_endpgm.
116 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
117 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
118 ; GFX12: ; %bb.0: ; %bb
119 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
120 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
121 ; GFX12-NEXT: s_endpgm
123 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
124 store <4 x i32> %res, ptr addrspace(1) %out
; swmmac i32 16x16x32 iu8 (i32 %A, <2 x i32> %B, i8 %Index) with only the i1
; immediate that precedes %B set to 1. The checks expect neg_lo:[0,1,0] on the
; selected instruction, followed by the 128-bit store of the result to %out
; and s_endpgm.
128 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
129 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
130 ; GFX12: ; %bb.0: ; %bb
131 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
132 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
133 ; GFX12-NEXT: s_endpgm
135 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
136 store <4 x i32> %res, ptr addrspace(1) %out
; swmmac i32 16x16x32 iu8 (i32 %A, <2 x i32> %B, i8 %Index) with only the
; trailing i1 immediate set to 1. The checks expect the clamp modifier (and no
; neg_lo) on the selected instruction, followed by the 128-bit store of the
; result to %out and s_endpgm.
140 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
141 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
142 ; GFX12: ; %bb.0: ; %bb
143 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
144 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
145 ; GFX12-NEXT: s_endpgm
147 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
148 store <4 x i32> %res, ptr addrspace(1) %out
; swmmac i32 16x16x32 iu4 (i32 %A, i32 %B, i16 %Index) with only the i1
; immediate that precedes %A set to 1. The checks expect neg_lo:[1,0,0] on the
; selected instruction, followed by the 128-bit store of the result to %out
; and s_endpgm.
154 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
155 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
156 ; GFX12: ; %bb.0: ; %bb
157 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
158 ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
159 ; GFX12-NEXT: s_endpgm
161 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
162 store <4 x i32> %res, ptr addrspace(1) %out
; swmmac i32 16x16x32 iu4 (i32 %A, i32 %B, i16 %Index) with only the i1
; immediate that precedes %B set to 1. The checks expect neg_lo:[0,1,0] on the
; selected instruction, followed by the 128-bit store of the result to %out
; and s_endpgm.
166 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
167 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
168 ; GFX12: ; %bb.0: ; %bb
169 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
170 ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
171 ; GFX12-NEXT: s_endpgm
173 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
174 store <4 x i32> %res, ptr addrspace(1) %out
; swmmac i32 16x16x32 iu4 (i32 %A, i32 %B, i16 %Index) with only the trailing
; i1 immediate set to 1. The checks expect the clamp modifier (and no neg_lo)
; on the selected instruction, followed by the 128-bit store of the result to
; %out and s_endpgm.
178 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
179 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
180 ; GFX12: ; %bb.0: ; %bb
181 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
182 ; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
183 ; GFX12-NEXT: s_endpgm
185 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
186 store <4 x i32> %res, ptr addrspace(1) %out
; swmmac i32 16x16x64 iu4 (i32 %A, <2 x i32> %B, i16 %Index) with only the i1
; immediate that precedes %A set to 1. The checks expect neg_lo:[1,0,0] on the
; selected instruction, followed by the 128-bit store of the result to %out
; and s_endpgm.
192 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
193 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
194 ; GFX12: ; %bb.0: ; %bb
195 ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
196 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
197 ; GFX12-NEXT: s_endpgm
199 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
200 store <4 x i32> %res, ptr addrspace(1) %out
; swmmac i32 16x16x64 iu4 (i32 %A, <2 x i32> %B, i16 %Index) with only the i1
; immediate that precedes %B set to 1. The checks expect neg_lo:[0,1,0] on the
; selected instruction, followed by the 128-bit store of the result to %out
; and s_endpgm.
204 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
205 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
206 ; GFX12: ; %bb.0: ; %bb
207 ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
208 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
209 ; GFX12-NEXT: s_endpgm
211 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
212 store <4 x i32> %res, ptr addrspace(1) %out
; swmmac i32 16x16x64 iu4 (i32 %A, <2 x i32> %B, i16 %Index) with only the
; trailing i1 immediate set to 1. The checks expect the clamp modifier (and no
; neg_lo) on the selected instruction, followed by the 128-bit store of the
; result to %out and s_endpgm.
216 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
217 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
218 ; GFX12: ; %bb.0: ; %bb
219 ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
220 ; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
221 ; GFX12-NEXT: s_endpgm
223 %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
224 store <4 x i32> %res, ptr addrspace(1) %out
; Declarations of the six intrinsics called by the tests in this file.
; Every i1 operand is marked immarg, so call sites must pass constants.
; The three wmma variants share one signature:
;   (i1, i32 A, i1, i32 B, <4 x i32> C, i1).
; The swmmac variants take an additional index operand (i8 or i16) between the
; <4 x i32> operand and the trailing i1, and two of them take B as <2 x i32>.
228 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
229 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
230 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
231 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
232 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
233 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)