1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s
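; Both RUN lines target gfx950: -global-isel=0 exercises the SelectionDAG path
; and -global-isel=1 the GlobalISel path. Checks common to both paths use the
; GCN prefix; the SDAG and GISEL prefixes are only used where the two paths
; would diverge.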
11 ; --------------------------------------------------------------------
12 ; Different format signatures
13 ; --------------------------------------------------------------------
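; Informational note: the cbsz and blgp immediates select the formats of the A
; and B operands, and the packed operand width in these tests tracks that
; choice: 8 dwords (v8i32) for the 8-bit FP8/BF8 formats (values 0-1), 6 dwords
; (v6i32) for the 6-bit FP6/BF6 formats (values 2-3), and 4 dwords (v4i32) for
; the 4-bit FP4 format (value 4).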
16 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
17 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
19 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
21 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
22 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
23 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
25 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
28 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
29 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
30 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
31 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
32 ; GCN-NEXT: s_setpc_b64 s[30:31]
33 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
36 i32 0, i32 %scale0, i32 0, i32 %scale1)
37 ret <4 x float> %result
40 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
41 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
43 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
44 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
45 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
46 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
47 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
49 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
52 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
53 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
54 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
55 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
56 ; GCN-NEXT: s_setpc_b64 s[30:31]
57 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
60 i32 1, i32 %scale0, i32 1, i32 %scale1)
61 ret <4 x float> %result
64 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
65 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
67 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
69 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
70 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
71 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
73 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
76 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
77 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
78 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
79 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
80 ; GCN-NEXT: s_setpc_b64 s[30:31]
81 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
84 i32 2, i32 %scale0, i32 2, i32 %scale1)
85 ret <4 x float> %result
88 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
89 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
91 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
93 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
94 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
95 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
97 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
100 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
101 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
102 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
103 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
104 ; GCN-NEXT: s_setpc_b64 s[30:31]
105 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
108 i32 3, i32 %scale0, i32 3, i32 %scale1)
109 ret <4 x float> %result
112 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
113 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
115 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
117 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
118 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
119 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
121 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
124 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
125 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
126 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
127 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
128 ; GCN-NEXT: s_setpc_b64 s[30:31]
129 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
132 i32 0, i32 %scale0, i32 3, i32 %scale1)
133 ret <4 x float> %result
136 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
137 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
139 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
141 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
142 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
143 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
145 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
148 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
149 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
150 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
151 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
152 ; GCN-NEXT: s_setpc_b64 s[30:31]
153 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
156 i32 3, i32 %scale0, i32 0, i32 %scale1)
157 ret <4 x float> %result
160 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
161 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
163 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
165 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
166 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
167 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
169 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
172 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
173 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
174 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
175 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
176 ; GCN-NEXT: s_setpc_b64 s[30:31]
177 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
180 i32 2, i32 %scale0, i32 3, i32 %scale1)
181 ret <4 x float> %result
184 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
185 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
187 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
188 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
189 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
190 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
191 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
193 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
196 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
197 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
198 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
199 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
200 ; GCN-NEXT: s_setpc_b64 s[30:31]
201 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
204 i32 3, i32 %scale0, i32 2, i32 %scale1)
205 ret <4 x float> %result
208 ; This should be optimized to avoid the scale
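; With both scale operands a constant 0 there is nothing to scale, so the
; checks expect the plain v_mfma_f32_16x16x128_f8f6f4 encoding instead of the
; v_mfma_scale form. The same folding is expected in the other
; __constant_scale_0_0 tests below.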
209 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
210 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
212 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
213 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
214 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
215 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
216 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
218 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
221 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
222 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
223 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
224 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
225 ; GCN-NEXT: s_setpc_b64 s[30:31]
226 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
229 i32 0, i32 0, i32 0, i32 0)
230 ret <4 x float> %result
234 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
235 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
237 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
239 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
240 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
241 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
243 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
246 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
247 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
248 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
249 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
250 ; GCN-NEXT: s_setpc_b64 s[30:31]
251 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
254 i32 0, i32 %scale0, i32 0, i32 %scale1)
255 ret <4 x float> %result
258 ; This should be optimized to avoid the scale
259 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
260 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
262 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
264 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
265 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
266 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
268 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
271 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
272 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
273 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
274 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
275 ; GCN-NEXT: s_setpc_b64 s[30:31]
276 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
279 i32 0, i32 0, i32 0, i32 0)
280 ret <4 x float> %result
284 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
285 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
287 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
289 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
290 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
291 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
293 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
296 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
297 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
298 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
299 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
300 ; GCN-NEXT: s_setpc_b64 s[30:31]
301 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
304 i32 0, i32 %scale0, i32 0, i32 %scale1)
305 ret <4 x float> %result
308 ; This should be optimized to avoid the scale
309 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
310 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
312 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
313 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
314 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
315 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
316 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
318 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
321 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
322 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
323 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
324 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
325 ; GCN-NEXT: s_setpc_b64 s[30:31]
326 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
329 i32 0, i32 0, i32 0, i32 0)
330 ret <4 x float> %result
334 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
335 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
337 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
338 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
339 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
340 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
341 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
343 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
346 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
347 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
348 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
349 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
350 ; GCN-NEXT: s_setpc_b64 s[30:31]
351 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
354 i32 0, i32 %scale0, i32 0, i32 %scale1)
355 ret <4 x float> %result
358 ; This should be optimized to avoid the scale
359 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
360 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
362 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
364 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
365 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
366 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
368 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
371 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
372 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
373 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
374 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
375 ; GCN-NEXT: s_setpc_b64 s[30:31]
376 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
379 i32 0, i32 0, i32 0, i32 0)
380 ret <4 x float> %result
384 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
385 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
387 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
388 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
389 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
390 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
391 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
393 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
396 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
397 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
398 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
399 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
400 ; GCN-NEXT: s_setpc_b64 s[30:31]
401 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
404 i32 0, i32 %scale0, i32 0, i32 %scale1)
405 ret <4 x float> %result
408 ; This should be optimized to avoid the scale
409 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
410 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
412 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
414 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
415 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
416 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
418 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
421 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
422 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
423 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
424 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
425 ; GCN-NEXT: s_setpc_b64 s[30:31]
426 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
429 i32 0, i32 0, i32 0, i32 0)
430 ret <4 x float> %result
434 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
435 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
437 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
438 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
439 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
440 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
441 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
443 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
446 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
447 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
448 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
449 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
450 ; GCN-NEXT: s_setpc_b64 s[30:31]
451 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
454 i32 0, i32 %scale0, i32 0, i32 %scale1)
455 ret <4 x float> %result
458 ; This should be optimized to avoid the scale
459 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
460 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
462 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
464 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
465 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
466 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
468 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
471 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
472 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
473 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
474 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
475 ; GCN-NEXT: s_setpc_b64 s[30:31]
476 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
479 i32 0, i32 0, i32 0, i32 0)
480 ret <4 x float> %result
484 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
485 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
487 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
488 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
489 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
490 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
491 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
493 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
496 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
497 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
498 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
499 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
500 ; GCN-NEXT: s_setpc_b64 s[30:31]
501 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
504 i32 0, i32 %scale0, i32 0, i32 %scale1)
505 ret <4 x float> %result
509 ; This should be optimized to avoid the scale
510 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
511 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
513 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
514 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
515 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
516 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
517 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
519 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
522 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
523 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
524 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
525 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
526 ; GCN-NEXT: s_setpc_b64 s[30:31]
527 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
530 i32 0, i32 0, i32 0, i32 0)
531 ret <4 x float> %result
535 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
536 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
538 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
539 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
540 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
541 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
542 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
544 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
547 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
548 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
549 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
550 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
551 ; GCN-NEXT: s_setpc_b64 s[30:31]
552 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
555 i32 0, i32 %scale0, i32 0, i32 %scale1)
556 ret <4 x float> %result
559 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
560 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0_0:
562 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
563 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
564 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
565 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
566 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
568 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
571 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
572 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
573 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
574 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
575 ; GCN-NEXT: s_setpc_b64 s[30:31]
576 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
579 i32 0, i32 0, i32 0, i32 0)
580 ret <4 x float> %result
584 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
585 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
587 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
589 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
590 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
591 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
593 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
596 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
597 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
598 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
599 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
600 ; GCN-NEXT: s_setpc_b64 s[30:31]
601 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
604 i32 0, i32 %scale0, i32 0, i32 %scale1)
605 ret <4 x float> %result
608 ; This should be optimized to avoid the scale
609 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
610 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
612 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
613 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
614 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
615 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
616 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
618 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
621 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
622 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
623 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
624 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
625 ; GCN-NEXT: s_setpc_b64 s[30:31]
626 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
629 i32 0, i32 0, i32 0, i32 0)
630 ret <4 x float> %result
634 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
635 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
637 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
639 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
640 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
641 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
643 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
646 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
647 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
648 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
649 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
650 ; GCN-NEXT: s_setpc_b64 s[30:31]
651 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
654 i32 0, i32 %scale0, i32 0, i32 %scale1)
655 ret <4 x float> %result
658 ; This should be optimized to avoid the scale
659 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
660 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
662 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
663 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
664 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
665 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
666 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
668 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
671 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
672 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
673 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
674 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
675 ; GCN-NEXT: s_setpc_b64 s[30:31]
676 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
679 i32 0, i32 0, i32 0, i32 0)
680 ret <4 x float> %result
684 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
685 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
687 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
689 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
690 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
691 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
693 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
696 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
697 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
698 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
699 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
700 ; GCN-NEXT: s_setpc_b64 s[30:31]
701 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
704 i32 0, i32 %scale0, i32 0, i32 %scale1)
705 ret <4 x float> %result
708 ; This should be optimized to avoid the scale
709 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
710 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
712 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
713 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
714 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
715 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
716 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
718 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
721 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
722 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
723 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
724 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
725 ; GCN-NEXT: s_setpc_b64 s[30:31]
726 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
729 i32 0, i32 0, i32 0, i32 0)
730 ret <4 x float> %result
734 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
735 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
737 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
738 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
739 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
740 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
741 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
743 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
746 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
747 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
748 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
749 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
750 ; GCN-NEXT: s_setpc_b64 s[30:31]
751 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
754 i32 0, i32 %scale0, i32 0, i32 %scale1)
755 ret <4 x float> %result
758 ; This should be optimized to avoid the scale
759 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
760 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
762 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
763 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
764 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
765 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
766 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
768 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
771 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
772 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
773 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
774 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
775 ; GCN-NEXT: s_setpc_b64 s[30:31]
776 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
779 i32 0, i32 0, i32 0, i32 0)
780 ret <4 x float> %result
784 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
785 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
787 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
788 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
789 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
790 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
791 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
793 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
795 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
796 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
797 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
798 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
799 ; GCN-NEXT: s_setpc_b64 s[30:31]
800 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
803 i32 0, i32 %scale0, i32 0, i32 %scale1)
804 ret <4 x float> %result
807 ; This should be optimized to avoid the scale
808 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
809 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
811 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
812 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
813 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
814 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
815 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
817 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
819 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
820 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
821 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
822 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
823 ; GCN-NEXT: s_setpc_b64 s[30:31]
824 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
827 i32 0, i32 0, i32 0, i32 0)
828 ret <4 x float> %result
832 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
833 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
835 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
836 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
837 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
838 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
839 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
841 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
843 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
844 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
845 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
846 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
847 ; GCN-NEXT: s_setpc_b64 s[30:31]
848 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
851 i32 0, i32 %scale0, i32 0, i32 %scale1)
852 ret <4 x float> %result
855 ; This should be optimized to avoid the scale
856 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
857 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
859 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
861 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
862 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
863 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
865 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
867 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
868 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
869 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
870 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
871 ; GCN-NEXT: s_setpc_b64 s[30:31]
872 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
875 i32 0, i32 0, i32 0, i32 0)
876 ret <4 x float> %result
881 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
882 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
884 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
885 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
886 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
887 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
888 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
890 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
893 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
894 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
895 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
896 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
897 ; GCN-NEXT: s_setpc_b64 s[30:31]
898 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
901 i32 0, i32 %scale0, i32 0, i32 %scale1)
902 ret <4 x float> %result
905 ; This should be optimized to avoid the scale
906 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
907 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
909 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
910 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
911 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
912 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
913 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
915 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
918 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
919 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
920 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
921 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
922 ; GCN-NEXT: s_setpc_b64 s[30:31]
923 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
926 i32 0, i32 0, i32 0, i32 0)
927 ret <4 x float> %result
931 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
932 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
934 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
935 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
936 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
937 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
938 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
940 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
943 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
944 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
945 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
946 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
947 ; GCN-NEXT: s_setpc_b64 s[30:31]
948 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
951 i32 0, i32 %scale0, i32 0, i32 %scale1)
952 ret <4 x float> %result
955 ; This should be optimized to avoid the scale
956 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
957 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
959 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
960 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
961 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
962 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
963 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
965 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
968 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
969 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
970 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
971 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
972 ; GCN-NEXT: s_setpc_b64 s[30:31]
973 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
976 i32 0, i32 0, i32 0, i32 0)
977 ret <4 x float> %result
981 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
982 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
984 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
985 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
986 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
987 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
988 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
990 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
992 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
993 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
994 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
995 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
996 ; GCN-NEXT: s_setpc_b64 s[30:31]
997 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
1000 i32 0, i32 %scale0, i32 0, i32 %scale1)
1001 ret <4 x float> %result
1004 ; This should be optimized to avoid the scale
1005 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
1006 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
1008 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1009 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
1010 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
1011 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
1012 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
1014 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
1016 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1017 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1018 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1019 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1020 ; GCN-NEXT: s_setpc_b64 s[30:31]
1021 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
1024 i32 0, i32 0, i32 0, i32 0)
1025 ret <4 x float> %result
1029 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1030 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
1032 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1033 ; GCN-NEXT: v_accvgpr_write_b32 a0, v10
1034 ; GCN-NEXT: v_accvgpr_write_b32 a1, v11
1035 ; GCN-NEXT: v_accvgpr_write_b32 a2, v12
1036 ; GCN-NEXT: v_accvgpr_write_b32 a3, v13
1038 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
1040 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1041 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1042 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1043 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1044 ; GCN-NEXT: s_setpc_b64 s[30:31]
1045 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
1048 i32 0, i32 %scale0, i32 0, i32 %scale1)
1049 ret <4 x float> %result
1052 ; This should be optimized to avoid the scale
1053 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
1054 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
1056 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057 ; GCN-NEXT: v_accvgpr_write_b32 a0, v10
1058 ; GCN-NEXT: v_accvgpr_write_b32 a1, v11
1059 ; GCN-NEXT: v_accvgpr_write_b32 a2, v12
1060 ; GCN-NEXT: v_accvgpr_write_b32 a3, v13
1062 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
1064 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1065 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1066 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1067 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1068 ; GCN-NEXT: s_setpc_b64 s[30:31]
1069 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
1072 i32 0, i32 0, i32 0, i32 0)
1073 ret <4 x float> %result
1077 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1078 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
1080 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1081 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
1082 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
1083 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
1084 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
1086 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
1088 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1089 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1090 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1091 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1092 ; GCN-NEXT: s_setpc_b64 s[30:31]
1093 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
1096 i32 0, i32 %scale0, i32 0, i32 %scale1)
1097 ret <4 x float> %result
1100 ; This should be optimized to avoid the scale
1101 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
1102 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
1104 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1105 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
1106 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
1107 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
1108 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
1110 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
1112 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1113 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1114 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1115 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1116 ; GCN-NEXT: s_setpc_b64 s[30:31]
1117 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
1120 i32 0, i32 0, i32 0, i32 0)
1121 ret <4 x float> %result
1125 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1126 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
1128 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1129 ; GCN-NEXT: v_accvgpr_write_b32 a0, v10
1130 ; GCN-NEXT: v_accvgpr_write_b32 a1, v11
1131 ; GCN-NEXT: v_accvgpr_write_b32 a2, v12
1132 ; GCN-NEXT: v_accvgpr_write_b32 a3, v13
1134 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
1136 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1137 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1138 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1139 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1140 ; GCN-NEXT: s_setpc_b64 s[30:31]
1141 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
1144 i32 0, i32 %scale0, i32 0, i32 %scale1)
1145 ret <4 x float> %result
1148 ; This should be optimized to avoid the scale
1149 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
1150 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
1152 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153 ; GCN-NEXT: v_accvgpr_write_b32 a0, v10
1154 ; GCN-NEXT: v_accvgpr_write_b32 a1, v11
1155 ; GCN-NEXT: v_accvgpr_write_b32 a2, v12
1156 ; GCN-NEXT: v_accvgpr_write_b32 a3, v13
1158 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
1160 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1161 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1162 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1163 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1164 ; GCN-NEXT: s_setpc_b64 s[30:31]
1165 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
1168 i32 0, i32 0, i32 0, i32 0)
1169 ret <4 x float> %result
1173 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1174 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
1176 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1177 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
1178 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
1179 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
1180 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
1182 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
1185 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1186 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1187 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1188 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1189 ; GCN-NEXT: s_setpc_b64 s[30:31]
1190 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
1193 i32 0, i32 %scale0, i32 0, i32 %scale1)
1194 ret <4 x float> %result
1197 ; This should be optimized to avoid the scale
1198 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
1199 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
1201 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1202 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
1203 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
1204 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
1205 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
1207 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
1210 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1211 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1212 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1213 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1214 ; GCN-NEXT: s_setpc_b64 s[30:31]
1215 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
1218 i32 0, i32 0, i32 0, i32 0)
1219 ret <4 x float> %result
1223 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1224 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
1226 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1227 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
1228 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
1229 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
1230 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
1232 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
1235 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1236 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1237 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1238 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1239 ; GCN-NEXT: s_setpc_b64 s[30:31]
1240 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
1243 i32 0, i32 %scale0, i32 0, i32 %scale1)
1244 ret <4 x float> %result
1247 ; This should be optimized to avoid the scale
1248 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
1249 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
1251 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1252 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
1253 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
1254 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
1255 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
1257 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
1260 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1261 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1262 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1263 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1264 ; GCN-NEXT: s_setpc_b64 s[30:31]
1265 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
1268 i32 0, i32 0, i32 0, i32 0)
1269 ret <4 x float> %result
1273 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1274 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
1276 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1277 ; GCN-NEXT: v_accvgpr_write_b32 a0, v10
1278 ; GCN-NEXT: v_accvgpr_write_b32 a1, v11
1279 ; GCN-NEXT: v_accvgpr_write_b32 a2, v12
1280 ; GCN-NEXT: v_accvgpr_write_b32 a3, v13
1282 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
1284 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1285 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1286 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1287 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1288 ; GCN-NEXT: s_setpc_b64 s[30:31]
1289 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
1292 i32 0, i32 %scale0, i32 0, i32 %scale1)
1293 ret <4 x float> %result
1296 ; This should be optimized to avoid the scale
1297 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
1298 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
1300 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1301 ; GCN-NEXT: v_accvgpr_write_b32 a0, v10
1302 ; GCN-NEXT: v_accvgpr_write_b32 a1, v11
1303 ; GCN-NEXT: v_accvgpr_write_b32 a2, v12
1304 ; GCN-NEXT: v_accvgpr_write_b32 a3, v13
1306 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
1308 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1309 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1310 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1311 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1312 ; GCN-NEXT: s_setpc_b64 s[30:31]
1313 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
1316 i32 0, i32 0, i32 0, i32 0)
1317 ret <4 x float> %result
1321 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1322 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
1324 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1325 ; GCN-NEXT: v_accvgpr_write_b32 a0, v10
1326 ; GCN-NEXT: v_accvgpr_write_b32 a1, v11
1327 ; GCN-NEXT: v_accvgpr_write_b32 a2, v12
1328 ; GCN-NEXT: v_accvgpr_write_b32 a3, v13
1330 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
1332 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1333 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1334 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1335 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1336 ; GCN-NEXT: s_setpc_b64 s[30:31]
1337 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
1340 i32 0, i32 %scale0, i32 0, i32 %scale1)
1341 ret <4 x float> %result
1344 ; This should be optimized to avoid the scale
1345 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
1346 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
1348 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1349 ; GCN-NEXT: v_accvgpr_write_b32 a0, v10
1350 ; GCN-NEXT: v_accvgpr_write_b32 a1, v11
1351 ; GCN-NEXT: v_accvgpr_write_b32 a2, v12
1352 ; GCN-NEXT: v_accvgpr_write_b32 a3, v13
1354 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
1356 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1357 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1358 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1359 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1360 ; GCN-NEXT: s_setpc_b64 s[30:31]
1361 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
1364 i32 0, i32 0, i32 0, i32 0)
1365 ret <4 x float> %result
1369 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1370 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
1372 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1373 ; GCN-NEXT: v_accvgpr_write_b32 a0, v8
1374 ; GCN-NEXT: v_accvgpr_write_b32 a1, v9
1375 ; GCN-NEXT: v_accvgpr_write_b32 a2, v10
1376 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11
1378 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
1380 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1381 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1382 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1383 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1384 ; GCN-NEXT: s_setpc_b64 s[30:31]
1385 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
1388 i32 0, i32 %scale0, i32 0, i32 %scale1)
1389 ret <4 x float> %result
1392 ; This should be optimized to avoid the scale
1393 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
1394 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
1396 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1397 ; GCN-NEXT: v_accvgpr_write_b32 a0, v8
1398 ; GCN-NEXT: v_accvgpr_write_b32 a1, v9
1399 ; GCN-NEXT: v_accvgpr_write_b32 a2, v10
1400 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11
1402 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
1404 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1405 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1406 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1407 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1408 ; GCN-NEXT: s_setpc_b64 s[30:31]
1409 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
1412 i32 0, i32 0, i32 0, i32 0)
1413 ret <4 x float> %result
1416 ; --------------------------------------------------------------------
1417 ; Different input parameter classes
1418 ; --------------------------------------------------------------------
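; The tests below vary which operands live in SGPRs, VGPRs, inline constants,
; or 32-bit literals for the A/B matrices and the two scale operands. Note
; that when both scale operands are SGPRs, one of them is first copied into a
; VGPR before the MFMA (see v_mov_b32_e32 v16, s1 in the first test below),
; presumably to stay within the scalar-source limit of the instruction.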
1420 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
1421 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
1423 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1424 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
1425 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
1426 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
1427 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
1428 ; GCN-NEXT: v_mov_b32_e32 v16, s1
1430 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
1433 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1434 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1435 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1436 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1437 ; GCN-NEXT: s_setpc_b64 s[30:31]
1438 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
1439 ret <4 x float> %result
1442 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
1443 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
1445 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1446 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
1447 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
1448 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
1449 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
1451 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0]
1454 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1455 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1456 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1457 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1458 ; GCN-NEXT: s_setpc_b64 s[30:31]
1459 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
1460 ret <4 x float> %result
1463 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
1464 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
1466 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1467 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
1468 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
1469 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
1470 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
1472 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0]
1475 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1476 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1477 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1478 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1479 ; GCN-NEXT: s_setpc_b64 s[30:31]
1480 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
1481 ret <4 x float> %result
1484 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
1485 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
1487 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1488 ; SDAG-NEXT: v_mov_b32_e32 v12, s0
1489 ; SDAG-NEXT: v_mov_b32_e32 v13, s1
1490 ; SDAG-NEXT: v_mov_b32_e32 v14, s2
1491 ; SDAG-NEXT: v_mov_b32_e32 v15, s3
1492 ; SDAG-NEXT: v_mov_b32_e32 v16, s16
1493 ; SDAG-NEXT: v_mov_b32_e32 v17, s17
1494 ; SDAG-NEXT: v_mov_b32_e32 v18, s18
1495 ; SDAG-NEXT: v_mov_b32_e32 v19, s19
1496 ; SDAG-NEXT: v_mov_b32_e32 v20, s28
1497 ; SDAG-NEXT: v_mov_b32_e32 v23, v1
1498 ; SDAG-NEXT: v_mov_b32_e32 v22, v0
1499 ; SDAG-NEXT: v_mov_b32_e32 v21, s29
1500 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v20
1501 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
1502 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
1503 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
1504 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
1505 ; SDAG-NEXT: v_mov_b32_e32 v8, s24
1506 ; SDAG-NEXT: v_mov_b32_e32 v9, s25
1507 ; SDAG-NEXT: v_mov_b32_e32 v10, s26
1508 ; SDAG-NEXT: v_mov_b32_e32 v11, s27
1509 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v21
1510 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v22
1511 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v23
1512 ; SDAG-NEXT: s_nop 1
1513 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
1514 ; SDAG-NEXT: s_nop 7
1515 ; SDAG-NEXT: s_nop 2
1516 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1517 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1518 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1519 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1520 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1522 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
1524 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1525 ; GISEL-NEXT: s_mov_b32 s12, s0
1526 ; GISEL-NEXT: s_mov_b32 s13, s1
1527 ; GISEL-NEXT: s_mov_b32 s14, s2
1528 ; GISEL-NEXT: s_mov_b32 s15, s3
1529 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
1530 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
1531 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
1532 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
1533 ; GISEL-NEXT: v_mov_b32_e32 v20, s28
1534 ; GISEL-NEXT: v_mov_b32_e32 v22, v0
1535 ; GISEL-NEXT: v_mov_b32_e32 v23, v1
1536 ; GISEL-NEXT: v_mov_b32_e32 v21, s29
1537 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
1538 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v20
1539 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
1540 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
1541 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
1542 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v21
1543 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v22
1544 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v23
1545 ; GISEL-NEXT: s_nop 1
1546 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
1547 ; GISEL-NEXT: s_nop 7
1548 ; GISEL-NEXT: s_nop 2
1549 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1550 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1551 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1552 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1553 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1554 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
1555 ret <4 x float> %result
1558 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
1559 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
1561 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1562 ; SDAG-NEXT: v_mov_b32_e32 v14, s0
1563 ; SDAG-NEXT: v_mov_b32_e32 v15, s1
1564 ; SDAG-NEXT: v_mov_b32_e32 v16, s2
1565 ; SDAG-NEXT: v_mov_b32_e32 v17, s3
1566 ; SDAG-NEXT: v_mov_b32_e32 v18, s16
1567 ; SDAG-NEXT: v_mov_b32_e32 v19, s17
1568 ; SDAG-NEXT: v_mov_b32_e32 v20, s18
1569 ; SDAG-NEXT: v_mov_b32_e32 v21, s19
1570 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
1571 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
1572 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
1573 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
1574 ; SDAG-NEXT: s_nop 1
1575 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0]
1576 ; SDAG-NEXT: s_nop 7
1577 ; SDAG-NEXT: s_nop 2
1578 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1579 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1580 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1581 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1582 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1584 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
1586 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1587 ; GISEL-NEXT: s_mov_b32 s12, s0
1588 ; GISEL-NEXT: s_mov_b32 s13, s1
1589 ; GISEL-NEXT: s_mov_b32 s14, s2
1590 ; GISEL-NEXT: s_mov_b32 s15, s3
1591 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
1592 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
1593 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
1594 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
1595 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
1596 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
1597 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
1598 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
1599 ; GISEL-NEXT: s_nop 1
1600 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0]
1601 ; GISEL-NEXT: s_nop 7
1602 ; GISEL-NEXT: s_nop 2
1603 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1604 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1605 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1606 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1607 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1608 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
1609 ret <4 x float> %result
1612 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
1613 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
1615 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1616 ; SDAG-NEXT: v_mov_b32_e32 v14, s0
1617 ; SDAG-NEXT: v_mov_b32_e32 v15, s1
1618 ; SDAG-NEXT: v_mov_b32_e32 v16, s2
1619 ; SDAG-NEXT: v_mov_b32_e32 v17, s3
1620 ; SDAG-NEXT: v_mov_b32_e32 v18, s16
1621 ; SDAG-NEXT: v_mov_b32_e32 v19, s17
1622 ; SDAG-NEXT: v_mov_b32_e32 v20, s18
1623 ; SDAG-NEXT: v_mov_b32_e32 v21, s19
1624 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
1625 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
1626 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
1627 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
1628 ; SDAG-NEXT: s_nop 1
1629 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0]
1630 ; SDAG-NEXT: s_nop 7
1631 ; SDAG-NEXT: s_nop 2
1632 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1633 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1634 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1635 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1636 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1638 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
1640 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1641 ; GISEL-NEXT: s_mov_b32 s12, s0
1642 ; GISEL-NEXT: s_mov_b32 s13, s1
1643 ; GISEL-NEXT: s_mov_b32 s14, s2
1644 ; GISEL-NEXT: s_mov_b32 s15, s3
1645 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
1646 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
1647 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
1648 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
1649 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
1650 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
1651 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
1652 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
1653 ; GISEL-NEXT: s_nop 1
1654 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0]
1655 ; GISEL-NEXT: s_nop 7
1656 ; GISEL-NEXT: s_nop 2
1657 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1658 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1659 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1660 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1661 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1662 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
1663 ret <4 x float> %result
1666 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
1667 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
1669 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1670 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
1671 ; SDAG-NEXT: v_mov_b32_e32 v14, s0
1672 ; SDAG-NEXT: v_mov_b32_e32 v15, s1
1673 ; SDAG-NEXT: v_mov_b32_e32 v16, s2
1674 ; SDAG-NEXT: v_mov_b32_e32 v17, s3
1675 ; SDAG-NEXT: v_mov_b32_e32 v18, s16
1676 ; SDAG-NEXT: v_mov_b32_e32 v19, s17
1677 ; SDAG-NEXT: v_mov_b32_e32 v20, s18
1678 ; SDAG-NEXT: v_mov_b32_e32 v21, s19
1679 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
1680 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
1681 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
1682 ; SDAG-NEXT: s_nop 1
1683 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
1684 ; SDAG-NEXT: s_nop 7
1685 ; SDAG-NEXT: s_nop 2
1686 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1687 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1688 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1689 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1690 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1692 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
1694 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1695 ; GISEL-NEXT: s_mov_b32 s12, s0
1696 ; GISEL-NEXT: s_mov_b32 s13, s1
1697 ; GISEL-NEXT: s_mov_b32 s14, s2
1698 ; GISEL-NEXT: s_mov_b32 s15, s3
1699 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
1700 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
1701 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
1702 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
1703 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
1704 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
1705 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
1706 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
1707 ; GISEL-NEXT: s_nop 1
1708 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
1709 ; GISEL-NEXT: s_nop 7
1710 ; GISEL-NEXT: s_nop 2
1711 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1712 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1713 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1714 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1715 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1716 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
1717 ret <4 x float> %result
1720 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
1721 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
1723 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1724 ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
1725 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
1726 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
1727 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
1729 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0]
1732 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1733 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1734 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1735 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1736 ; GCN-NEXT: s_setpc_b64 s[30:31]
1737 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
1738 ret <4 x float> %result
1741 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
1742 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
1744 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1745 ; SDAG-NEXT: v_mov_b32_e32 v10, s0
1746 ; SDAG-NEXT: v_mov_b32_e32 v11, s1
1747 ; SDAG-NEXT: v_mov_b32_e32 v12, s2
1748 ; SDAG-NEXT: v_mov_b32_e32 v13, s3
1749 ; SDAG-NEXT: v_mov_b32_e32 v14, s16
1750 ; SDAG-NEXT: v_mov_b32_e32 v15, s17
1751 ; SDAG-NEXT: v_mov_b32_e32 v16, s18
1752 ; SDAG-NEXT: v_mov_b32_e32 v17, s19
1753 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s20
1754 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s21
1755 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s22
1756 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s23
1757 ; SDAG-NEXT: s_nop 1
1758 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0]
1759 ; SDAG-NEXT: s_nop 7
1760 ; SDAG-NEXT: s_nop 2
1761 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1762 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1763 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1764 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1765 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1767 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
1769 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1770 ; GISEL-NEXT: s_mov_b32 s12, s0
1771 ; GISEL-NEXT: s_mov_b32 s13, s1
1772 ; GISEL-NEXT: s_mov_b32 s14, s2
1773 ; GISEL-NEXT: s_mov_b32 s15, s3
1774 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
1775 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
1776 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17]
1777 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
1778 ; GISEL-NEXT: v_accvgpr_write_b32 a0, s20
1779 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s21
1780 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s22
1781 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s23
1782 ; GISEL-NEXT: s_nop 1
1783 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0]
1784 ; GISEL-NEXT: s_nop 7
1785 ; GISEL-NEXT: s_nop 2
1786 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1787 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1788 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1789 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1790 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1791 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
1792 ret <4 x float> %result
1795 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1796 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
1798 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1799 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
1800 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
1801 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
1802 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
1804 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
1807 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
1808 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
1809 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
1810 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
1811 ; GCN-NEXT: s_setpc_b64 s[30:31]
1812 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
1813 ret <4 x float> %result
1816 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1817 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
1819 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1820 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
1821 ; SDAG-NEXT: s_movk_i32 s0, 0x41
1822 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
1823 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
1824 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
1825 ; SDAG-NEXT: s_nop 1
1826 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0]
1827 ; SDAG-NEXT: s_nop 7
1828 ; SDAG-NEXT: s_nop 2
1829 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1830 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1831 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1832 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1833 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1835 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
1837 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1838 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
1839 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
1840 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
1841 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
1842 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
1843 ; GISEL-NEXT: s_nop 1
1844 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
1845 ; GISEL-NEXT: s_nop 7
1846 ; GISEL-NEXT: s_nop 2
1847 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1848 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1849 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1850 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1851 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1852 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
1853 ret <4 x float> %result
1856 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
1857 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
1859 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1860 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
1861 ; SDAG-NEXT: s_movk_i32 s0, 0x41
1862 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
1863 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
1864 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
1865 ; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
1866 ; SDAG-NEXT: s_nop 1
1867 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
1868 ; SDAG-NEXT: s_nop 7
1869 ; SDAG-NEXT: s_nop 2
1870 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1871 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1872 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1873 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1874 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1876 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
1878 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1879 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
1880 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
1881 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
1882 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
1883 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
1884 ; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
1885 ; GISEL-NEXT: s_nop 1
1886 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
1887 ; GISEL-NEXT: s_nop 7
1888 ; GISEL-NEXT: s_nop 2
1889 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
1890 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
1891 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
1892 ; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
1893 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1894 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
1895 ret <4 x float> %result
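; The kernel variants below take the operands as kernel arguments: they are
; loaded with s_load_dwordx16/s_load_dwordx8 (or dwordx4/dwordx2), copied into
; VGPRs/AGPRs, and the result is written back with global_store_dwordx4.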
1898 define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
1899 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
1901 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
1902 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
1903 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1904 ; SDAG-NEXT: v_mov_b32_e32 v0, s8
1905 ; SDAG-NEXT: v_mov_b32_e32 v1, s9
1906 ; SDAG-NEXT: v_mov_b32_e32 v2, s10
1907 ; SDAG-NEXT: v_mov_b32_e32 v3, s11
1908 ; SDAG-NEXT: v_mov_b32_e32 v4, s12
1909 ; SDAG-NEXT: v_mov_b32_e32 v5, s13
1910 ; SDAG-NEXT: v_mov_b32_e32 v6, s14
1911 ; SDAG-NEXT: v_mov_b32_e32 v7, s15
1912 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
1913 ; SDAG-NEXT: v_mov_b32_e32 v8, s16
1914 ; SDAG-NEXT: v_mov_b32_e32 v9, s17
1915 ; SDAG-NEXT: v_mov_b32_e32 v10, s18
1916 ; SDAG-NEXT: v_mov_b32_e32 v11, s19
1917 ; SDAG-NEXT: v_mov_b32_e32 v12, s20
1918 ; SDAG-NEXT: v_mov_b32_e32 v13, s21
1919 ; SDAG-NEXT: v_mov_b32_e32 v14, s22
1920 ; SDAG-NEXT: v_mov_b32_e32 v15, s23
1921 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1922 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
1923 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
1924 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
1925 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
1926 ; SDAG-NEXT: v_mov_b32_e32 v17, s13
1927 ; SDAG-NEXT: s_nop 1
1928 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2
1929 ; SDAG-NEXT: s_nop 7
1930 ; SDAG-NEXT: s_nop 2
1931 ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
1932 ; SDAG-NEXT: s_endpgm
1934 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
1936 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
1937 ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x40
1938 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1939 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
1940 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
1941 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
1942 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
1943 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
1944 ; GISEL-NEXT: v_accvgpr_write_b32 a0, s24
1945 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
1946 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
1947 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
1948 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s25
1949 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s26
1950 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
1951 ; GISEL-NEXT: v_mov_b32_e32 v16, s29
1952 ; GISEL-NEXT: s_nop 1
1953 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2
1954 ; GISEL-NEXT: v_mov_b32_e32 v0, 0
1955 ; GISEL-NEXT: s_nop 7
1956 ; GISEL-NEXT: s_nop 1
1957 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31]
1958 ; GISEL-NEXT: s_endpgm
1959 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
1960 store <4 x float> %result, ptr addrspace(1) %ptr, align 16
1964 define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
1965 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
1967 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
1968 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
1969 ; SDAG-NEXT: s_movk_i32 s6, 0x41
1970 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
1971 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
1972 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1973 ; SDAG-NEXT: v_mov_b32_e32 v0, s8
1974 ; SDAG-NEXT: v_mov_b32_e32 v1, s9
1975 ; SDAG-NEXT: v_mov_b32_e32 v2, s10
1976 ; SDAG-NEXT: v_mov_b32_e32 v3, s11
1977 ; SDAG-NEXT: v_mov_b32_e32 v4, s12
1978 ; SDAG-NEXT: v_mov_b32_e32 v5, s13
1979 ; SDAG-NEXT: v_mov_b32_e32 v6, s14
1980 ; SDAG-NEXT: v_mov_b32_e32 v7, s15
1981 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
1982 ; SDAG-NEXT: v_mov_b32_e32 v8, s16
1983 ; SDAG-NEXT: v_mov_b32_e32 v9, s17
1984 ; SDAG-NEXT: v_mov_b32_e32 v10, s18
1985 ; SDAG-NEXT: v_mov_b32_e32 v11, s19
1986 ; SDAG-NEXT: v_mov_b32_e32 v12, s20
1987 ; SDAG-NEXT: v_mov_b32_e32 v13, s21
1988 ; SDAG-NEXT: v_mov_b32_e32 v14, s22
1989 ; SDAG-NEXT: v_mov_b32_e32 v15, s23
1990 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
1991 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
1992 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
1993 ; SDAG-NEXT: s_nop 1
1994 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0]
1995 ; SDAG-NEXT: s_nop 7
1996 ; SDAG-NEXT: s_nop 2
1997 ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
1998 ; SDAG-NEXT: s_endpgm
2000 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
2002 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2003 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2004 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
2005 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2006 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2007 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2008 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2009 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2010 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2011 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2012 ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2013 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2014 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2015 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2016 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2017 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2018 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2019 ; GISEL-NEXT: s_nop 1
2020 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
2021 ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2022 ; GISEL-NEXT: s_nop 7
2023 ; GISEL-NEXT: s_nop 1
2024 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2025 ; GISEL-NEXT: s_endpgm
2026 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
2027 store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2031 ; This should be optimized to avoid the scale
2032 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2033 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
2035 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2036 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2037 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2038 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2039 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2041 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
2044 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2045 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2046 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2047 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2048 ; GCN-NEXT: s_setpc_b64 s[30:31]
2049 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
2050 ret <4 x float> %result
2053 ; This should be optimized to avoid the scale, with non-0 op_sel arguments.
2054 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2055 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
2057 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2058 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2059 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2060 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2061 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2063 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
2066 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2067 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2068 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2069 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2070 ; GCN-NEXT: s_setpc_b64 s[30:31]
2071 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
2072 ret <4 x float> %result
2075 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2076 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
2078 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2079 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2080 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2081 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2082 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2084 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0]
2087 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2088 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2089 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2090 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2091 ; GCN-NEXT: s_setpc_b64 s[30:31]
2092 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
2093 ret <4 x float> %result
2096 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2097 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
2099 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2100 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2101 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2102 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2103 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2105 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0]
2108 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2109 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2110 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2111 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2112 ; GCN-NEXT: s_setpc_b64 s[30:31]
2113 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
2114 ret <4 x float> %result
2117 ; --------------------------------------------------------------------
2118 ; Incorrect signature for format cases (IR vector too large)
2119 ; --------------------------------------------------------------------
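; These tests deliberately pass IR vectors that are wider than the format
; selected by cbsz/blgp (e.g. <8 x i32> operands for fp6 or fp4 inputs); the
; checks confirm the cbsz/blgp modifiers are still emitted as requested.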
2121 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2122 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
2124 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2125 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2126 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2127 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2128 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2130 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
2133 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2134 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2135 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2136 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2137 ; GCN-NEXT: s_setpc_b64 s[30:31]
2138 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
2141 i32 0, i32 %scale0, i32 0, i32 %scale1)
2142 ret <4 x float> %result
2145 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2146 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
2148 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2149 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2150 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2151 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2152 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2154 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
2157 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2158 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2159 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2160 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2161 ; GCN-NEXT: s_setpc_b64 s[30:31]
2162 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
2165 i32 0, i32 %scale0, i32 0, i32 %scale1)
2166 ret <4 x float> %result
2169 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2170 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
2172 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2173 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2174 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2175 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2176 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2178 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
2180 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2181 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2182 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2183 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2184 ; GCN-NEXT: s_setpc_b64 s[30:31]
2185 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
2188 i32 0, i32 %scale0, i32 0, i32 %scale1)
2189 ret <4 x float> %result
2192 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
2193 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
2195 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2196 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2197 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2198 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2199 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2201 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
2203 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2204 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2205 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2206 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2207 ; GCN-NEXT: s_setpc_b64 s[30:31]
2208 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
2211 i32 0, i32 0, i32 0, i32 0)
2212 ret <4 x float> %result
2215 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2216 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
2218 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2219 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2220 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2221 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2222 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2224 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
2227 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2228 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2229 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2230 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2231 ; GCN-NEXT: s_setpc_b64 s[30:31]
2232 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
2235 i32 0, i32 %scale0, i32 0, i32 %scale1)
2236 ret <4 x float> %result
2239 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2240 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
2242 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2243 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2244 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2245 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2246 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2248 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
2251 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2252 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2253 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2254 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2255 ; GCN-NEXT: s_setpc_b64 s[30:31]
2256 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
2259 i32 0, i32 %scale0, i32 0, i32 %scale1)
2260 ret <4 x float> %result
2263 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2264 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
2266 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2267 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
2268 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
2269 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
2270 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
2272 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
2275 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2276 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2277 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2278 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2279 ; GCN-NEXT: s_setpc_b64 s[30:31]
2280 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
2283 i32 0, i32 %scale0, i32 0, i32 %scale1)
2284 ret <4 x float> %result
2287 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2288 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
2290 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2291 ; GCN-NEXT: v_accvgpr_write_b32 a0, v14
2292 ; GCN-NEXT: v_accvgpr_write_b32 a1, v15
2293 ; GCN-NEXT: v_accvgpr_write_b32 a2, v16
2294 ; GCN-NEXT: v_accvgpr_write_b32 a3, v17
2296 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
2299 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2300 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2301 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2302 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2303 ; GCN-NEXT: s_setpc_b64 s[30:31]
2304 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
2307 i32 0, i32 %scale0, i32 0, i32 %scale1)
2308 ret <4 x float> %result
2311 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
2312 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
2314 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2315 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2316 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2317 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2318 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2320 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
2322 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2323 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2324 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2325 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2326 ; GCN-NEXT: s_setpc_b64 s[30:31]
2327 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
2330 i32 0, i32 %scale0, i32 0, i32 %scale1)
2331 ret <4 x float> %result
2334 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
2335 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
2337 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2338 ; GCN-NEXT: v_accvgpr_write_b32 a0, v16
2339 ; GCN-NEXT: v_accvgpr_write_b32 a1, v17
2340 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18
2341 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19
2343 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
2345 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
2346 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
2347 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
2348 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
2349 ; GCN-NEXT: s_setpc_b64 s[30:31]
2350 %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
2353 i32 0, i32 0, i32 0, i32 0)
2354 ret <4 x float> %result
2357 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
2358 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
2359 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
2360 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
2361 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
2362 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
2363 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
2364 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
2365 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
2367 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
2368 attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }