1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
; Splat <8 x float> 1.0 accumulator: checks it is selected directly as the
; inline immediate 1.0 operand of v_wmma_f32_16x16x16_f16 (no register
; materialization).
4 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
5 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
6 ; GFX12: ; %bb.0: ; %bb
7 ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
8 ; GFX12-NEXT: s_clause 0x1
9 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
10 ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
12 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
13 ; GFX12-NEXT: s_endpgm
15 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
16 store <8 x float> %res, ptr addrspace(1) %out
; Splat <8 x float> 3.0 (0x40400000) accumulator: 3.0 is not an inline
; immediate, so it is broadcast through SGPRs and moved into v[10:17] before
; the WMMA consumes it as a register operand.
20 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
21 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
22 ; GFX12: ; %bb.0: ; %bb
23 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
24 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
25 ; GFX12-NEXT: s_mov_b32 s7, s0
26 ; GFX12-NEXT: s_mov_b32 s1, s0
27 ; GFX12-NEXT: s_mov_b32 s2, s0
28 ; GFX12-NEXT: s_mov_b32 s3, s0
29 ; GFX12-NEXT: s_mov_b32 s4, s0
30 ; GFX12-NEXT: s_mov_b32 s5, s0
31 ; GFX12-NEXT: s_mov_b32 s6, s0
32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
33 ; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
34 ; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
35 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
36 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
37 ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
38 ; GFX12-NEXT: s_clause 0x1
39 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
40 ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
42 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
43 ; GFX12-NEXT: s_endpgm
45 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
46 store <8 x float> %res, ptr addrspace(1) %out
; bf16 A/B operands (passed as <8 x i16>) with a splat float 1.0 accumulator:
; the f32 accumulator 1.0 still folds to the inline immediate operand of
; v_wmma_f32_16x16x16_bf16.
50 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
51 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
52 ; GFX12: ; %bb.0: ; %bb
53 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
54 ; GFX12-NEXT: s_clause 0x1
55 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
56 ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
58 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
59 ; GFX12-NEXT: s_endpgm
61 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
62 store <8 x float> %res, ptr addrspace(1) %out
; Non-inlineable splat 3.0 accumulator for the bf16-input variant: same SGPR
; broadcast + v_dual_mov materialization pattern as the f16 case above.
66 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
67 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
68 ; GFX12: ; %bb.0: ; %bb
69 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
70 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
71 ; GFX12-NEXT: s_mov_b32 s7, s0
72 ; GFX12-NEXT: s_mov_b32 s1, s0
73 ; GFX12-NEXT: s_mov_b32 s2, s0
74 ; GFX12-NEXT: s_mov_b32 s3, s0
75 ; GFX12-NEXT: s_mov_b32 s4, s0
76 ; GFX12-NEXT: s_mov_b32 s5, s0
77 ; GFX12-NEXT: s_mov_b32 s6, s0
78 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
79 ; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
80 ; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
81 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
82 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
83 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
84 ; GFX12-NEXT: s_clause 0x1
85 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
86 ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
88 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
89 ; GFX12-NEXT: s_endpgm
91 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
92 store <8 x float> %res, ptr addrspace(1) %out
; f16 accumulator variant: the splat <8 x half> 1.0 accumulator folds to the
; inline immediate 1.0 of v_wmma_f16_16x16x16_f16; result is a single
; 4-register tile (v[10:13]) stored with one b128 store.
96 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
97 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
98 ; GFX12: ; %bb.0: ; %bb
99 ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
100 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
101 ; GFX12-NEXT: s_nop 0
102 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
103 ; GFX12-NEXT: s_endpgm
105 %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
106 store <8 x half> %res, ptr addrspace(1) %out
; Splat half 3.0 accumulator: packs to 0x42004200 per dword, which is not an
; inline immediate, so four accumulator VGPRs are materialized via SGPR
; broadcast before the WMMA.
110 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
111 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
112 ; GFX12: ; %bb.0: ; %bb
113 ; GFX12-NEXT: s_mov_b32 s0, 0x42004200
114 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
115 ; GFX12-NEXT: s_mov_b32 s3, s0
116 ; GFX12-NEXT: s_mov_b32 s1, s0
117 ; GFX12-NEXT: s_mov_b32 s2, s0
118 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
119 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
120 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
121 ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
122 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
123 ; GFX12-NEXT: s_nop 0
124 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
125 ; GFX12-NEXT: s_endpgm
127 %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
128 store <8 x half> %res, ptr addrspace(1) %out
; bf16 accumulator of splat 1.0 (i16 16256 = 0x3f80; packed 0x3f803f80 per
; dword): unlike the f16/f32 cases, this packed pattern is NOT an inline
; immediate, so even the "1.0" accumulator is materialized in registers.
132 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
133 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
134 ; GFX12: ; %bb.0: ; %bb
135 ; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
136 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
137 ; GFX12-NEXT: s_mov_b32 s3, s0
138 ; GFX12-NEXT: s_mov_b32 s1, s0
139 ; GFX12-NEXT: s_mov_b32 s2, s0
140 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
141 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
142 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
143 ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
144 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
145 ; GFX12-NEXT: s_nop 0
146 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
147 ; GFX12-NEXT: s_endpgm
149 %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
150 store <8 x i16> %res, ptr addrspace(1) %out
; bf16 accumulator of splat 1.5 (i16 16320 = 0x3fc0; packed 0x3fc03fc0):
; non-inlineable, materialized through SGPR broadcast like the case above.
154 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
155 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
156 ; GFX12: ; %bb.0: ; %bb
157 ; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
158 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
159 ; GFX12-NEXT: s_mov_b32 s3, s0
160 ; GFX12-NEXT: s_mov_b32 s1, s0
161 ; GFX12-NEXT: s_mov_b32 s2, s0
162 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
163 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
164 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
165 ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
166 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
167 ; GFX12-NEXT: s_nop 0
168 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
169 ; GFX12-NEXT: s_endpgm
171 %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
172 store <8 x i16> %res, ptr addrspace(1) %out
; iu8 integer WMMA with a splat i32 1 accumulator: selected as the inline
; immediate 1 operand of v_wmma_i32_16x16x16_iu8.
176 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
177 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
178 ; GFX12: ; %bb.0: ; %bb
179 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
180 ; GFX12-NEXT: s_clause 0x1
181 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
182 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
183 ; GFX12-NEXT: s_nop 0
184 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
185 ; GFX12-NEXT: s_endpgm
187 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
188 store <8 x i32> %res, ptr addrspace(1) %out
; Splat i32 128 (0x80) accumulator: outside the inline-immediate range, so it
; is broadcast through SGPRs into v[6:13] before the iu8 WMMA.
192 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
193 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
194 ; GFX12: ; %bb.0: ; %bb
195 ; GFX12-NEXT: s_movk_i32 s0, 0x80
196 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
197 ; GFX12-NEXT: s_mov_b32 s7, s0
198 ; GFX12-NEXT: s_mov_b32 s1, s0
199 ; GFX12-NEXT: s_mov_b32 s2, s0
200 ; GFX12-NEXT: s_mov_b32 s3, s0
201 ; GFX12-NEXT: s_mov_b32 s4, s0
202 ; GFX12-NEXT: s_mov_b32 s5, s0
203 ; GFX12-NEXT: s_mov_b32 s6, s0
204 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
205 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
206 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
207 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
208 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
209 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
210 ; GFX12-NEXT: s_clause 0x1
211 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
212 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
213 ; GFX12-NEXT: s_nop 0
214 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
215 ; GFX12-NEXT: s_endpgm
217 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
218 store <8 x i32> %res, ptr addrspace(1) %out
; iu4 WMMA (scalar i32 A/B operands) with a splat i32 1 accumulator: folds to
; the inline immediate 1 operand of v_wmma_i32_16x16x16_iu4.
222 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
223 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
224 ; GFX12: ; %bb.0: ; %bb
225 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
226 ; GFX12-NEXT: s_clause 0x1
227 ; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
228 ; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
229 ; GFX12-NEXT: s_nop 0
230 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
231 ; GFX12-NEXT: s_endpgm
233 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
234 store <8 x i32> %res, ptr addrspace(1) %out
; Splat i32 128 accumulator for the iu4 variant: non-inlineable, materialized
; into v[4:11] via SGPR broadcast before the WMMA.
238 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
239 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
240 ; GFX12: ; %bb.0: ; %bb
241 ; GFX12-NEXT: s_movk_i32 s0, 0x80
242 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
243 ; GFX12-NEXT: s_mov_b32 s7, s0
244 ; GFX12-NEXT: s_mov_b32 s1, s0
245 ; GFX12-NEXT: s_mov_b32 s2, s0
246 ; GFX12-NEXT: s_mov_b32 s3, s0
247 ; GFX12-NEXT: s_mov_b32 s4, s0
248 ; GFX12-NEXT: s_mov_b32 s5, s0
249 ; GFX12-NEXT: s_mov_b32 s6, s0
250 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
251 ; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
252 ; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
253 ; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
254 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
255 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
256 ; GFX12-NEXT: s_clause 0x1
257 ; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
258 ; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
259 ; GFX12-NEXT: s_nop 0
260 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
261 ; GFX12-NEXT: s_endpgm
263 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
264 store <8 x i32> %res, ptr addrspace(1) %out
; fp8 x fp8 WMMA with a splat float 1.0 accumulator: folds to the inline
; immediate 1.0 operand of v_wmma_f32_16x16x16_fp8_fp8.
268 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
269 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
270 ; GFX12: ; %bb.0: ; %bb
271 ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
272 ; GFX12-NEXT: s_clause 0x1
273 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
274 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
275 ; GFX12-NEXT: s_nop 0
276 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
277 ; GFX12-NEXT: s_endpgm
279 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
280 store <8 x float> %res, ptr addrspace(1) %out
; fp8 x fp8 WMMA with non-inlineable splat 3.0 accumulator: materialized into
; v[6:13] through SGPR broadcast before the WMMA.
284 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
285 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
286 ; GFX12: ; %bb.0: ; %bb
287 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
288 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
289 ; GFX12-NEXT: s_mov_b32 s7, s0
290 ; GFX12-NEXT: s_mov_b32 s1, s0
291 ; GFX12-NEXT: s_mov_b32 s2, s0
292 ; GFX12-NEXT: s_mov_b32 s3, s0
293 ; GFX12-NEXT: s_mov_b32 s4, s0
294 ; GFX12-NEXT: s_mov_b32 s5, s0
295 ; GFX12-NEXT: s_mov_b32 s6, s0
296 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
297 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
298 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
299 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
300 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
301 ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
302 ; GFX12-NEXT: s_clause 0x1
303 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
304 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
305 ; GFX12-NEXT: s_nop 0
306 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
307 ; GFX12-NEXT: s_endpgm
309 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
310 store <8 x float> %res, ptr addrspace(1) %out
; bf8 x fp8 WMMA with a splat float 1.0 accumulator: folds to the inline
; immediate 1.0 operand of v_wmma_f32_16x16x16_bf8_fp8.
314 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
315 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
316 ; GFX12: ; %bb.0: ; %bb
317 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
318 ; GFX12-NEXT: s_clause 0x1
319 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
320 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
321 ; GFX12-NEXT: s_nop 0
322 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
323 ; GFX12-NEXT: s_endpgm
325 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
326 store <8 x float> %res, ptr addrspace(1) %out
; bf8 x fp8 WMMA with non-inlineable splat 3.0 accumulator: materialized into
; v[6:13] through SGPR broadcast before the WMMA.
330 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
331 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
332 ; GFX12: ; %bb.0: ; %bb
333 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
334 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
335 ; GFX12-NEXT: s_mov_b32 s7, s0
336 ; GFX12-NEXT: s_mov_b32 s1, s0
337 ; GFX12-NEXT: s_mov_b32 s2, s0
338 ; GFX12-NEXT: s_mov_b32 s3, s0
339 ; GFX12-NEXT: s_mov_b32 s4, s0
340 ; GFX12-NEXT: s_mov_b32 s5, s0
341 ; GFX12-NEXT: s_mov_b32 s6, s0
342 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
343 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
344 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
345 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
346 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
347 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
348 ; GFX12-NEXT: s_clause 0x1
349 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
350 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
351 ; GFX12-NEXT: s_nop 0
352 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
353 ; GFX12-NEXT: s_endpgm
355 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
356 store <8 x float> %res, ptr addrspace(1) %out
; fp8 x bf8 WMMA with a splat float 1.0 accumulator: folds to the inline
; immediate 1.0 operand of v_wmma_f32_16x16x16_fp8_bf8.
360 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
361 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
362 ; GFX12: ; %bb.0: ; %bb
363 ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
364 ; GFX12-NEXT: s_clause 0x1
365 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
366 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
367 ; GFX12-NEXT: s_nop 0
368 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
369 ; GFX12-NEXT: s_endpgm
371 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
372 store <8 x float> %res, ptr addrspace(1) %out
; fp8 x bf8 WMMA with non-inlineable splat 3.0 accumulator: materialized into
; v[6:13] through SGPR broadcast before the WMMA.
376 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
377 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
378 ; GFX12: ; %bb.0: ; %bb
379 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
380 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
381 ; GFX12-NEXT: s_mov_b32 s7, s0
382 ; GFX12-NEXT: s_mov_b32 s1, s0
383 ; GFX12-NEXT: s_mov_b32 s2, s0
384 ; GFX12-NEXT: s_mov_b32 s3, s0
385 ; GFX12-NEXT: s_mov_b32 s4, s0
386 ; GFX12-NEXT: s_mov_b32 s5, s0
387 ; GFX12-NEXT: s_mov_b32 s6, s0
388 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
389 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
390 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
391 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
392 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
393 ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
394 ; GFX12-NEXT: s_clause 0x1
395 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
396 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
397 ; GFX12-NEXT: s_nop 0
398 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
399 ; GFX12-NEXT: s_endpgm
401 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
402 store <8 x float> %res, ptr addrspace(1) %out
; bf8 x bf8 WMMA with a splat float 1.0 accumulator: folds to the inline
; immediate 1.0 operand of v_wmma_f32_16x16x16_bf8_bf8.
406 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
407 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
408 ; GFX12: ; %bb.0: ; %bb
409 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
410 ; GFX12-NEXT: s_clause 0x1
411 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
412 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
413 ; GFX12-NEXT: s_nop 0
414 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
415 ; GFX12-NEXT: s_endpgm
417 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
418 store <8 x float> %res, ptr addrspace(1) %out
; bf8 x bf8 WMMA with non-inlineable splat 3.0 accumulator: materialized into
; v[6:13] through SGPR broadcast before the WMMA.
422 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
423 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
424 ; GFX12: ; %bb.0: ; %bb
425 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
426 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
427 ; GFX12-NEXT: s_mov_b32 s7, s0
428 ; GFX12-NEXT: s_mov_b32 s1, s0
429 ; GFX12-NEXT: s_mov_b32 s2, s0
430 ; GFX12-NEXT: s_mov_b32 s3, s0
431 ; GFX12-NEXT: s_mov_b32 s4, s0
432 ; GFX12-NEXT: s_mov_b32 s5, s0
433 ; GFX12-NEXT: s_mov_b32 s6, s0
434 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
435 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
436 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
437 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
438 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
439 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
440 ; GFX12-NEXT: s_clause 0x1
441 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
442 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
443 ; GFX12-NEXT: s_nop 0
444 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
445 ; GFX12-NEXT: s_endpgm
447 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
448 store <8 x float> %res, ptr addrspace(1) %out
; K=32 iu4 WMMA (<2 x i32> A/B operands) with splat i32 1 accumulator: folds
; to the inline immediate 1 operand of v_wmma_i32_16x16x32_iu4.
452 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
453 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
454 ; GFX12: ; %bb.0: ; %bb
455 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
456 ; GFX12-NEXT: s_clause 0x1
457 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
458 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
459 ; GFX12-NEXT: s_nop 0
460 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
461 ; GFX12-NEXT: s_endpgm
463 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
464 store <8 x i32> %res, ptr addrspace(1) %out
; K=32 iu4 WMMA with splat i32 128 accumulator: non-inlineable, materialized
; into v[6:13] via SGPR broadcast before the WMMA.
468 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
469 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
470 ; GFX12: ; %bb.0: ; %bb
471 ; GFX12-NEXT: s_movk_i32 s0, 0x80
472 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
473 ; GFX12-NEXT: s_mov_b32 s7, s0
474 ; GFX12-NEXT: s_mov_b32 s1, s0
475 ; GFX12-NEXT: s_mov_b32 s2, s0
476 ; GFX12-NEXT: s_mov_b32 s3, s0
477 ; GFX12-NEXT: s_mov_b32 s4, s0
478 ; GFX12-NEXT: s_mov_b32 s5, s0
479 ; GFX12-NEXT: s_mov_b32 s6, s0
480 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
481 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
482 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
483 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
484 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
485 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
486 ; GFX12-NEXT: s_clause 0x1
487 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
488 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
489 ; GFX12-NEXT: s_nop 0
490 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
491 ; GFX12-NEXT: s_endpgm
493 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
494 store <8 x i32> %res, ptr addrspace(1) %out
; Declarations of the GFX12 WMMA intrinsics exercised by the tests above.
; NOTE(review): the llvm.amdgcn.swmmac.* declarations are not referenced by
; any test visible in this excerpt — presumably used by tests elsewhere in
; this file; confirm before removing.
498 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
499 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
500 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
501 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
502 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
503 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
504 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
505 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
506 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
507 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
508 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
509 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
510 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
511 declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
512 declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
513 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
514 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
515 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
516 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
517 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
518 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
519 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)