1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
5 declare i32 @llvm.amdgcn.workitem.id.x()
7 ; --------------------------------------------------------------------
8 ; llvm.amdgcn.smfmac.f32.16x16x64.f16
9 ; --------------------------------------------------------------------
11 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg)
; Kernel test for llvm.amdgcn.smfmac.f32.16x16x64.f16.
; The <4 x float> accumulator is loaded from %arg at the element selected by
; the workitem id x; %a, %b and %idx come straight from the kernel arguments.
; The intrinsic result is stored to the base pointer %arg (not to %gep).
; The two selection-DAG / GlobalISel configurations use separate check prefixes.
13 define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
14 ; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
15 ; SDAG: ; %bb.0: ; %bb
16 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
17 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
18 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
19 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
20 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
21 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
22 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
23 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
24 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
25 ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
26 ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
28 ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
29 ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
30 ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
31 ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
32 ; SDAG-NEXT: v_mov_b32_e32 v17, s16
33 ; SDAG-NEXT: s_waitcnt vmcnt(0)
35 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
37 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
40 ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
41 ; GISEL: ; %bb.0: ; %bb
42 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
43 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
44 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
45 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
46 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
47 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
48 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
49 ; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
50 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
51 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
52 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
53 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
54 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
55 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
56 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
57 ; GISEL-NEXT: v_mov_b32_e32 v16, s16
58 ; GISEL-NEXT: s_waitcnt vmcnt(0)
60 ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
61 ; GISEL-NEXT: v_mov_b32_e32 v0, 0
63 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
64 ; GISEL-NEXT: s_endpgm
66 %id = call i32 @llvm.amdgcn.workitem.id.x()
67 %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
68 %in.1 = load <4 x float>, ptr addrspace(1) %gep
; The two trailing immediates (1, 2) show up as "cbsz:1 abid:2" in the checked instruction.
69 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %a, <16 x half> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
70 store <4 x float> %mai.1, ptr addrspace(1) %arg
; Non-kernel form of the 16x16x64 f16 test: all four operands are ordinary
; function arguments and the intrinsic result is returned directly.
; Both immediates are 0 and the checked instruction carries no cbsz/abid modifiers.
74 define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
75 ; SDAG-LABEL: test_smfmac_f32_16x16x64_f16:
77 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
79 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
80 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
81 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
83 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16
85 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
86 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
87 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
88 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
89 ; SDAG-NEXT: s_setpc_b64 s[30:31]
91 ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16:
93 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94 ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
96 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
97 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
98 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
99 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
100 ; GISEL-NEXT: s_setpc_b64 s[30:31]
; Operand order: %arg0, %arg1, accumulator %arg2, index %arg3, then the two immediates.
101 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
102 ret <4 x float> %result
; Plain-argument 16x16x64 f16 call with non-zero immediates (1, 3).
; Both check prefixes expect the instruction to be printed with "cbsz:1 abid:3".
105 define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
106 ; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
108 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
110 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
111 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
112 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
114 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
116 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
117 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
118 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
119 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
120 ; SDAG-NEXT: s_setpc_b64 s[30:31]
122 ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
124 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125 ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
126 ; GISEL-NEXT: s_nop 6
127 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
128 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
129 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
130 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
131 ; GISEL-NEXT: s_setpc_b64 s[30:31]
; First immediate (1) is printed as cbsz, second immediate (3) as abid.
132 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
133 ret <4 x float> %result
; Plain-argument 16x16x64 f16 call with the immediates (3, 1).
; Both check prefixes expect the instruction to be printed with "cbsz:3 abid:1".
136 define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
137 ; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
139 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
141 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
142 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
143 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
145 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
147 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
148 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
149 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
150 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
151 ; SDAG-NEXT: s_setpc_b64 s[30:31]
153 ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
155 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156 ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
157 ; GISEL-NEXT: s_nop 6
158 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
159 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
160 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
161 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
162 ; GISEL-NEXT: s_setpc_b64 s[30:31]
; First immediate (3) is printed as cbsz, second immediate (1) as abid.
163 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
164 ret <4 x float> %result
; 16x16x64 f16 call where every operand is marked inreg.
; The checks show the operands being copied out of s0-s3 and s16-s28 into
; vector (and, for SelectionDAG, accumulator) registers before the instruction.
; Both immediates are 0, so no cbsz/abid modifiers appear in the checks.
167 define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
168 ; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
170 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171 ; SDAG-NEXT: v_mov_b32_e32 v8, s0
172 ; SDAG-NEXT: v_mov_b32_e32 v9, s1
173 ; SDAG-NEXT: v_mov_b32_e32 v10, s2
174 ; SDAG-NEXT: v_mov_b32_e32 v11, s3
175 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
176 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
177 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
178 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
179 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
180 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
181 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
182 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
183 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
184 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
185 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
186 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
187 ; SDAG-NEXT: v_mov_b32_e32 v12, s28
189 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12
191 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
192 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
193 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
194 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
195 ; SDAG-NEXT: s_setpc_b64 s[30:31]
197 ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
199 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
200 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
201 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
202 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
203 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
204 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
205 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
206 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
207 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
208 ; GISEL-NEXT: v_mov_b32_e32 v16, s28
209 ; GISEL-NEXT: s_nop 1
210 ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
211 ; GISEL-NEXT: s_setpc_b64 s[30:31]
; Same call shape as the plain-argument form; only the inreg argument attributes differ.
212 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
213 ret <4 x float> %result
216 ; --------------------------------------------------------------------
217 ; llvm.amdgcn.smfmac.f32.32x32x32.f16
218 ; --------------------------------------------------------------------
220 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half>, <16 x half>, <16 x float>, i32, i32 immarg, i32 immarg)
; Kernel test for llvm.amdgcn.smfmac.f32.32x32x32.f16.
; The <16 x float> accumulator is loaded from %arg at the element selected by
; the workitem id x; %a, %b and %idx come straight from the kernel arguments.
; The intrinsic result is stored to the base pointer %arg (not to %gep).
; The checks expect the 16-float accumulator to move as four dwordx4 loads and
; four dwordx4 stores.
222 define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
223 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
224 ; SDAG: ; %bb.0: ; %bb
225 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
226 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
227 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
228 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
229 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
230 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
231 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
232 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
233 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
234 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
235 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
236 ; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
237 ; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
238 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
239 ; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
240 ; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
241 ; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
242 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
243 ; SDAG-NEXT: v_mov_b32_e32 v28, s16
244 ; SDAG-NEXT: s_waitcnt vmcnt(0)
246 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
247 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
250 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
251 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
252 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
253 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
254 ; SDAG-NEXT: s_endpgm
256 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
257 ; GISEL: ; %bb.0: ; %bb
258 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
259 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
260 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
261 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
262 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
263 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
264 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
265 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
266 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
267 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
268 ; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
269 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
270 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
271 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
272 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
273 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
274 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
275 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
276 ; GISEL-NEXT: v_mov_b32_e32 v28, s16
277 ; GISEL-NEXT: s_waitcnt vmcnt(0)
278 ; GISEL-NEXT: s_nop 0
279 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
280 ; GISEL-NEXT: v_mov_b32_e32 v16, 0
281 ; GISEL-NEXT: s_nop 7
282 ; GISEL-NEXT: s_nop 1
283 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
284 ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
285 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
286 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
287 ; GISEL-NEXT: s_endpgm
289 %id = call i32 @llvm.amdgcn.workitem.id.x()
290 %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
291 %in.1 = load <16 x float>, ptr addrspace(1) %gep
; The two trailing immediates (1, 2) show up as "cbsz:1 abid:2" in the checked instruction.
292 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %a, <16 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
293 store <16 x float> %mai.1, ptr addrspace(1) %arg
; Non-kernel form of the 32x32x32 f16 test: all four operands are ordinary
; function arguments and the <16 x float> intrinsic result is returned directly.
; Both immediates are 0 and the checked instruction carries no cbsz/abid modifiers.
297 define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
298 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
300 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
301 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
302 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
303 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
304 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
305 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
306 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
307 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
308 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
309 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
310 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
311 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
312 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
313 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
314 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
315 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
316 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
318 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28
321 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
322 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
323 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
324 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
325 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
326 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
327 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
328 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
329 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
330 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
331 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
332 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
333 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
334 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
335 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
336 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
337 ; SDAG-NEXT: s_setpc_b64 s[30:31]
339 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
341 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
343 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
344 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
345 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
346 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
347 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
348 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
349 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
350 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
351 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
352 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
353 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
354 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
355 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
356 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
357 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
358 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
359 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
360 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
361 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
362 ; GISEL-NEXT: s_nop 1
363 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
364 ; GISEL-NEXT: s_setpc_b64 s[30:31]
; Operand order: %arg0, %arg1, accumulator %arg2, index %arg3, then the two immediates.
365 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
366 ret <16 x float> %result
; Plain-argument 32x32x32 f16 call with non-zero immediates (1, 3).
; Both check prefixes expect the instruction to be printed with "cbsz:1 abid:3".
369 define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
370 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
372 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
373 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
374 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
375 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
376 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
377 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
378 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
379 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
380 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
381 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
382 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
383 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
384 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
385 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
386 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
387 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
388 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
390 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
393 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
394 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
395 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
396 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
397 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
398 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
399 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
400 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
401 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
402 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
403 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
404 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
405 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
406 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
407 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
408 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
409 ; SDAG-NEXT: s_setpc_b64 s[30:31]
411 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
413 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
415 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
416 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
417 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
418 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
419 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
420 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
421 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
422 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
423 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
424 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
425 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
426 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
427 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
428 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
429 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
430 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
431 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
432 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
433 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
434 ; GISEL-NEXT: s_nop 1
435 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
436 ; GISEL-NEXT: s_setpc_b64 s[30:31]
; First immediate (1) is printed as cbsz, second immediate (3) as abid.
437 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
438 ret <16 x float> %result
; Plain-argument 32x32x32 f16 call with the immediates (3, 1).
; Both check prefixes expect the instruction to be printed with "cbsz:3 abid:1".
441 define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
442 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
444 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
446 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
447 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
448 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
449 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
450 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
451 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
452 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
453 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
454 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
455 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
456 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
457 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
458 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
459 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
460 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
462 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
465 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
466 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
467 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
468 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
469 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
470 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
471 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
472 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
473 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
474 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
475 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
476 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
477 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
478 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
479 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
480 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
481 ; SDAG-NEXT: s_setpc_b64 s[30:31]
483 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
485 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
487 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
488 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
489 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
490 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
491 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
492 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
493 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
494 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
495 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
496 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
497 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
498 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
499 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
500 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
501 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
502 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
503 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
504 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
505 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
506 ; GISEL-NEXT: s_nop 1
507 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
508 ; GISEL-NEXT: s_setpc_b64 s[30:31]
; First immediate (3) is printed as cbsz, second immediate (1) as abid.
509 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
510 ret <16 x float> %result
; 32x32x32 f16 call where every operand is marked inreg.
; The checks show the operands arriving in s0-s3 and s16-s29, with the tail of
; the <16 x float> accumulator and the index arriving in v0-v10 instead.
; Both immediates are 0, so no cbsz/abid modifiers appear in the checks.
513 define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
514 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
516 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
517 ; SDAG-NEXT: v_mov_b32_e32 v28, s0
518 ; SDAG-NEXT: v_mov_b32_e32 v29, s1
519 ; SDAG-NEXT: v_mov_b32_e32 v30, s2
520 ; SDAG-NEXT: v_mov_b32_e32 v31, s3
521 ; SDAG-NEXT: v_mov_b32_e32 v12, s24
522 ; SDAG-NEXT: v_mov_b32_e32 v27, v9
523 ; SDAG-NEXT: v_mov_b32_e32 v26, v8
524 ; SDAG-NEXT: v_mov_b32_e32 v25, v7
525 ; SDAG-NEXT: v_mov_b32_e32 v24, v6
526 ; SDAG-NEXT: v_mov_b32_e32 v23, v5
527 ; SDAG-NEXT: v_mov_b32_e32 v22, v4
528 ; SDAG-NEXT: v_mov_b32_e32 v21, v3
529 ; SDAG-NEXT: v_mov_b32_e32 v20, v2
530 ; SDAG-NEXT: v_mov_b32_e32 v19, v1
531 ; SDAG-NEXT: v_mov_b32_e32 v18, v0
532 ; SDAG-NEXT: v_mov_b32_e32 v13, s25
533 ; SDAG-NEXT: v_mov_b32_e32 v14, s26
534 ; SDAG-NEXT: v_mov_b32_e32 v15, s27
535 ; SDAG-NEXT: v_mov_b32_e32 v16, s28
536 ; SDAG-NEXT: v_mov_b32_e32 v17, s29
537 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
538 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
539 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
540 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
541 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
542 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
543 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
544 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
545 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
546 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
547 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
548 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
549 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
550 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
551 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
552 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
553 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
554 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
555 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
556 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
557 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
558 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
559 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
560 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
562 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10
565 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
566 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
567 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
568 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
569 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
570 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
571 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
572 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
573 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
574 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
575 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
576 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
577 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
578 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
579 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
580 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
581 ; SDAG-NEXT: s_setpc_b64 s[30:31]
583 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
585 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
586 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
587 ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
588 ; GISEL-NEXT: v_mov_b32_e32 v18, s24
589 ; GISEL-NEXT: v_mov_b32_e32 v19, s25
590 ; GISEL-NEXT: v_mov_b32_e32 v24, v0
591 ; GISEL-NEXT: v_mov_b32_e32 v25, v1
592 ; GISEL-NEXT: v_mov_b32_e32 v26, v2
593 ; GISEL-NEXT: v_mov_b32_e32 v27, v3
594 ; GISEL-NEXT: v_mov_b32_e32 v28, v4
595 ; GISEL-NEXT: v_mov_b32_e32 v29, v5
596 ; GISEL-NEXT: v_mov_b32_e32 v30, v6
597 ; GISEL-NEXT: v_mov_b32_e32 v31, v7
598 ; GISEL-NEXT: v_mov_b32_e32 v32, v8
599 ; GISEL-NEXT: v_mov_b32_e32 v33, v9
600 ; GISEL-NEXT: v_mov_b32_e32 v16, v10
601 ; GISEL-NEXT: v_mov_b32_e32 v20, s26
602 ; GISEL-NEXT: v_mov_b32_e32 v21, s27
603 ; GISEL-NEXT: v_mov_b32_e32 v22, s28
604 ; GISEL-NEXT: v_mov_b32_e32 v23, s29
605 ; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
606 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
607 ; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
608 ; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
609 ; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
610 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
611 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
612 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
613 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
614 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
615 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
616 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
617 ; GISEL-NEXT: s_nop 1
618 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[34:37], v[48:55], v16
619 ; GISEL-NEXT: s_setpc_b64 s[30:31]
; Same call shape as the plain-argument form; only the inreg argument attributes differ.
620 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
621 ret <16 x float> %result
624 ; --------------------------------------------------------------------
625 ; llvm.amdgcn.smfmac.f32.16x16x64.bf16
626 ; --------------------------------------------------------------------
628 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat>, <16 x bfloat>, <4 x float>, i32, i32 immarg, i32 immarg)
; Kernel test for llvm.amdgcn.smfmac.f32.16x16x64.bf16.
; The <4 x float> accumulator is loaded from %arg at the element selected by
; the workitem id x; %a, %b and %idx come straight from the kernel arguments.
; The intrinsic result is stored to the base pointer %arg (not to %gep).
; Both llc configurations share the common GCN check prefix for this function
; (presumably GlobalISel falls back to SelectionDAG for bfloat operands, given
; -global-isel-abort=2 -- TODO confirm).
630 define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
631 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr:
632 ; GCN: ; %bb.0: ; %bb
633 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
634 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
635 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
636 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
637 ; GCN-NEXT: v_mov_b32_e32 v16, 0
638 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
639 ; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
640 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
641 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
642 ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
643 ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
644 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
645 ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
646 ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
647 ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
648 ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
649 ; GCN-NEXT: v_mov_b32_e32 v17, s16
650 ; GCN-NEXT: s_waitcnt vmcnt(0)
652 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
654 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
657 %id = call i32 @llvm.amdgcn.workitem.id.x()
658 %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
659 %in.1 = load <4 x float>, ptr addrspace(1) %gep
; The two trailing immediates (1, 2) show up as "cbsz:1 abid:2" in the checked instruction.
660 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
661 store <4 x float> %mai.1, ptr addrspace(1) %arg
; Non-kernel form of the 16x16x64 bf16 test: all four operands are ordinary
; function arguments and the intrinsic result is returned directly.
; Both immediates are 0 and the checked instruction carries no cbsz/abid modifiers.
; A single shared GCN check prefix covers both llc configurations here.
665 define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
666 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16:
668 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
669 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
670 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
671 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
672 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
674 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16
676 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
677 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
678 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
679 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
680 ; GCN-NEXT: s_setpc_b64 s[30:31]
; Operand order: %arg0, %arg1, accumulator %arg2, index %arg3, then the two immediates.
681 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
682 ret <4 x float> %result
; Plain-argument 16x16x64 bf16 call with non-zero immediates (1, 3).
; The shared GCN checks expect the instruction to be printed with "cbsz:1 abid:3".
685 define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
686 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
688 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
689 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
690 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
691 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
692 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
694 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
696 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
697 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
698 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
699 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
700 ; GCN-NEXT: s_setpc_b64 s[30:31]
; First immediate (1) is printed as cbsz, second immediate (3) as abid.
701 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
702 ret <4 x float> %result
; Plain-argument 16x16x64 bf16 call with the immediates (3, 1).
; The shared GCN checks expect the instruction to be printed with "cbsz:3 abid:1".
705 define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
706 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
708 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
709 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
710 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
711 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
712 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
714 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
716 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
717 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
718 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
719 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
720 ; GCN-NEXT: s_setpc_b64 s[30:31]
; First immediate (3) is printed as cbsz, second immediate (1) as abid.
721 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
722 ret <4 x float> %result
; Every argument is marked inreg. The expected code copies the s-register
; inputs into vector registers first: %arg0 from s0-s3 into v8-v11, %arg1 from
; s16-s23 into v0-v7, the accumulator from s24-s27 straight into a0-a3, and the
; index from s28 into v12. The smfmac is then issued with immediates (0, 0),
; so no cbsz/abid modifiers are expected on the instruction.
725 define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
726 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
728 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
729 ; GCN-NEXT: v_mov_b32_e32 v8, s0
730 ; GCN-NEXT: v_mov_b32_e32 v9, s1
731 ; GCN-NEXT: v_mov_b32_e32 v10, s2
732 ; GCN-NEXT: v_mov_b32_e32 v11, s3
733 ; GCN-NEXT: v_mov_b32_e32 v0, s16
734 ; GCN-NEXT: v_mov_b32_e32 v1, s17
735 ; GCN-NEXT: v_mov_b32_e32 v2, s18
736 ; GCN-NEXT: v_mov_b32_e32 v3, s19
737 ; GCN-NEXT: v_mov_b32_e32 v4, s20
738 ; GCN-NEXT: v_mov_b32_e32 v5, s21
739 ; GCN-NEXT: v_mov_b32_e32 v6, s22
740 ; GCN-NEXT: v_mov_b32_e32 v7, s23
741 ; GCN-NEXT: v_accvgpr_write_b32 a0, s24
742 ; GCN-NEXT: v_accvgpr_write_b32 a1, s25
743 ; GCN-NEXT: v_accvgpr_write_b32 a2, s26
744 ; GCN-NEXT: v_accvgpr_write_b32 a3, s27
745 ; GCN-NEXT: v_mov_b32_e32 v12, s28
747 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
749 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
750 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
751 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
752 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
753 ; GCN-NEXT: s_setpc_b64 s[30:31]
754 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
755 ret <4 x float> %result
758 ; --------------------------------------------------------------------
759 ; llvm.amdgcn.smfmac.f32.32x32x32.bf16
760 ; --------------------------------------------------------------------
; Intrinsic under test in this section. The third operand has the same
; <16 x float> type as the result, and the last two i32 operands are immarg;
; the expected assembly in the tests below prints those two immediates as the
; cbsz and abid modifiers when they are non-zero.
762 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat>, <16 x bfloat>, <16 x float>, i32, i32 immarg, i32 immarg)
; Kernel variant of the 32x32x32 bf16 test. The IR loads a <16 x float>
; accumulator from %arg at the element selected by the workitem id, passes it
; to the intrinsic together with the kernel arguments %a, %b and %idx
; (immediates 1, 2), and stores the result at the base of %arg.
; The expected code keeps the accumulator in plain VGPRs v[0:15] for both the
; input and the result, moves the scalar-loaded %a/%b/%idx into v[24:27],
; v[16:23] and v28, and prints cbsz:1 abid:2 on the smfmac.
; NOTE(review): attribute group #0 is defined outside this excerpt; its effect
; on the expected output cannot be confirmed from here.
764 define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
765 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
766 ; GCN: ; %bb.0: ; %bb
767 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
768 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
769 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
770 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
771 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
772 ; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
773 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
774 ; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
775 ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
776 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
777 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
778 ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
779 ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
780 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
781 ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
782 ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
783 ; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
784 ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
785 ; GCN-NEXT: v_mov_b32_e32 v28, s16
786 ; GCN-NEXT: s_waitcnt vmcnt(0)
788 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
789 ; GCN-NEXT: v_mov_b32_e32 v16, 0
792 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
793 ; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
794 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
795 ; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
798 %id = call i32 @llvm.amdgcn.workitem.id.x()
799 %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
800 %in.1 = load <16 x float>, ptr addrspace(1) %gep
801 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
802 store <16 x float> %mai.1, ptr addrspace(1) %arg
; Baseline callable-function test for the 32x32x32 bf16 smfmac with both
; trailing immediates 0, so the expected instruction has no cbsz/abid
; modifiers. The accumulator %arg2 arrives in v12-v27; the expected code
; copies it into a[0:15], runs the smfmac on the AGPR tuple, and reads the
; sixteen results back into v0-v15 for the return value.
806 define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
807 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
809 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
810 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
811 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
812 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
813 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
814 ; GCN-NEXT: v_accvgpr_write_b32 a4, v16
815 ; GCN-NEXT: v_accvgpr_write_b32 a5, v17
816 ; GCN-NEXT: v_accvgpr_write_b32 a6, v18
817 ; GCN-NEXT: v_accvgpr_write_b32 a7, v19
818 ; GCN-NEXT: v_accvgpr_write_b32 a8, v20
819 ; GCN-NEXT: v_accvgpr_write_b32 a9, v21
820 ; GCN-NEXT: v_accvgpr_write_b32 a10, v22
821 ; GCN-NEXT: v_accvgpr_write_b32 a11, v23
822 ; GCN-NEXT: v_accvgpr_write_b32 a12, v24
823 ; GCN-NEXT: v_accvgpr_write_b32 a13, v25
824 ; GCN-NEXT: v_accvgpr_write_b32 a14, v26
825 ; GCN-NEXT: v_accvgpr_write_b32 a15, v27
827 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
830 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
831 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
832 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
833 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
834 ; GCN-NEXT: v_accvgpr_read_b32 v4, a4
835 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5
836 ; GCN-NEXT: v_accvgpr_read_b32 v6, a6
837 ; GCN-NEXT: v_accvgpr_read_b32 v7, a7
838 ; GCN-NEXT: v_accvgpr_read_b32 v8, a8
839 ; GCN-NEXT: v_accvgpr_read_b32 v9, a9
840 ; GCN-NEXT: v_accvgpr_read_b32 v10, a10
841 ; GCN-NEXT: v_accvgpr_read_b32 v11, a11
842 ; GCN-NEXT: v_accvgpr_read_b32 v12, a12
843 ; GCN-NEXT: v_accvgpr_read_b32 v13, a13
844 ; GCN-NEXT: v_accvgpr_read_b32 v14, a14
845 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15
846 ; GCN-NEXT: s_setpc_b64 s[30:31]
847 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
848 ret <16 x float> %result
; 32x32x32 bf16 smfmac with trailing immediates (1, 3); the expected
; instruction must print cbsz:1 abid:3. Apart from the modifiers, the expected
; code matches the baseline test: accumulator copied v12-v27 -> a[0:15], then
; read back into v0-v15 after the smfmac.
851 define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
852 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
854 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
855 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
856 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
857 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
858 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
859 ; GCN-NEXT: v_accvgpr_write_b32 a4, v16
860 ; GCN-NEXT: v_accvgpr_write_b32 a5, v17
861 ; GCN-NEXT: v_accvgpr_write_b32 a6, v18
862 ; GCN-NEXT: v_accvgpr_write_b32 a7, v19
863 ; GCN-NEXT: v_accvgpr_write_b32 a8, v20
864 ; GCN-NEXT: v_accvgpr_write_b32 a9, v21
865 ; GCN-NEXT: v_accvgpr_write_b32 a10, v22
866 ; GCN-NEXT: v_accvgpr_write_b32 a11, v23
867 ; GCN-NEXT: v_accvgpr_write_b32 a12, v24
868 ; GCN-NEXT: v_accvgpr_write_b32 a13, v25
869 ; GCN-NEXT: v_accvgpr_write_b32 a14, v26
870 ; GCN-NEXT: v_accvgpr_write_b32 a15, v27
872 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
875 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
876 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
877 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
878 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
879 ; GCN-NEXT: v_accvgpr_read_b32 v4, a4
880 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5
881 ; GCN-NEXT: v_accvgpr_read_b32 v6, a6
882 ; GCN-NEXT: v_accvgpr_read_b32 v7, a7
883 ; GCN-NEXT: v_accvgpr_read_b32 v8, a8
884 ; GCN-NEXT: v_accvgpr_read_b32 v9, a9
885 ; GCN-NEXT: v_accvgpr_read_b32 v10, a10
886 ; GCN-NEXT: v_accvgpr_read_b32 v11, a11
887 ; GCN-NEXT: v_accvgpr_read_b32 v12, a12
888 ; GCN-NEXT: v_accvgpr_read_b32 v13, a13
889 ; GCN-NEXT: v_accvgpr_read_b32 v14, a14
890 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15
891 ; GCN-NEXT: s_setpc_b64 s[30:31]
892 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
893 ret <16 x float> %result
; 32x32x32 bf16 smfmac with trailing immediates (3, 1), the reverse of the
; __flags0 test; the expected instruction must print cbsz:3 abid:1. The
; surrounding accumulator copies (v12-v27 -> a[0:15] -> v0-v15) are unchanged.
896 define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
897 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
899 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
901 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
902 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
903 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
904 ; GCN-NEXT: v_accvgpr_write_b32 a4, v16
905 ; GCN-NEXT: v_accvgpr_write_b32 a5, v17
906 ; GCN-NEXT: v_accvgpr_write_b32 a6, v18
907 ; GCN-NEXT: v_accvgpr_write_b32 a7, v19
908 ; GCN-NEXT: v_accvgpr_write_b32 a8, v20
909 ; GCN-NEXT: v_accvgpr_write_b32 a9, v21
910 ; GCN-NEXT: v_accvgpr_write_b32 a10, v22
911 ; GCN-NEXT: v_accvgpr_write_b32 a11, v23
912 ; GCN-NEXT: v_accvgpr_write_b32 a12, v24
913 ; GCN-NEXT: v_accvgpr_write_b32 a13, v25
914 ; GCN-NEXT: v_accvgpr_write_b32 a14, v26
915 ; GCN-NEXT: v_accvgpr_write_b32 a15, v27
917 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
920 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
921 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
922 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
923 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
924 ; GCN-NEXT: v_accvgpr_read_b32 v4, a4
925 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5
926 ; GCN-NEXT: v_accvgpr_read_b32 v6, a6
927 ; GCN-NEXT: v_accvgpr_read_b32 v7, a7
928 ; GCN-NEXT: v_accvgpr_read_b32 v8, a8
929 ; GCN-NEXT: v_accvgpr_read_b32 v9, a9
930 ; GCN-NEXT: v_accvgpr_read_b32 v10, a10
931 ; GCN-NEXT: v_accvgpr_read_b32 v11, a11
932 ; GCN-NEXT: v_accvgpr_read_b32 v12, a12
933 ; GCN-NEXT: v_accvgpr_read_b32 v13, a13
934 ; GCN-NEXT: v_accvgpr_read_b32 v14, a14
935 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15
936 ; GCN-NEXT: s_setpc_b64 s[30:31]
937 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
938 ret <16 x float> %result
; Every argument is marked inreg, but the expected code shows that only part
; of the input actually arrives in s-registers: %arg0 comes from s0-s3, %arg1
; from s16-s23, and accumulator elements 0-5 from s24-s29, while accumulator
; elements 6-15 are taken from incoming v0-v9 and the index operand from v10.
; All accumulator elements are staged through v12-v27 before being written to
; a[0:15]. Immediates are (0, 0), so no cbsz/abid modifiers are expected.
941 define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
942 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
944 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
945 ; GCN-NEXT: v_mov_b32_e32 v28, s0
946 ; GCN-NEXT: v_mov_b32_e32 v29, s1
947 ; GCN-NEXT: v_mov_b32_e32 v30, s2
948 ; GCN-NEXT: v_mov_b32_e32 v31, s3
949 ; GCN-NEXT: v_mov_b32_e32 v12, s24
950 ; GCN-NEXT: v_mov_b32_e32 v27, v9
951 ; GCN-NEXT: v_mov_b32_e32 v26, v8
952 ; GCN-NEXT: v_mov_b32_e32 v25, v7
953 ; GCN-NEXT: v_mov_b32_e32 v24, v6
954 ; GCN-NEXT: v_mov_b32_e32 v23, v5
955 ; GCN-NEXT: v_mov_b32_e32 v22, v4
956 ; GCN-NEXT: v_mov_b32_e32 v21, v3
957 ; GCN-NEXT: v_mov_b32_e32 v20, v2
958 ; GCN-NEXT: v_mov_b32_e32 v19, v1
959 ; GCN-NEXT: v_mov_b32_e32 v18, v0
960 ; GCN-NEXT: v_mov_b32_e32 v13, s25
961 ; GCN-NEXT: v_mov_b32_e32 v14, s26
962 ; GCN-NEXT: v_mov_b32_e32 v15, s27
963 ; GCN-NEXT: v_mov_b32_e32 v16, s28
964 ; GCN-NEXT: v_mov_b32_e32 v17, s29
965 ; GCN-NEXT: v_accvgpr_write_b32 a0, v12
966 ; GCN-NEXT: v_mov_b32_e32 v0, s16
967 ; GCN-NEXT: v_mov_b32_e32 v1, s17
968 ; GCN-NEXT: v_mov_b32_e32 v2, s18
969 ; GCN-NEXT: v_mov_b32_e32 v3, s19
970 ; GCN-NEXT: v_mov_b32_e32 v4, s20
971 ; GCN-NEXT: v_mov_b32_e32 v5, s21
972 ; GCN-NEXT: v_mov_b32_e32 v6, s22
973 ; GCN-NEXT: v_mov_b32_e32 v7, s23
974 ; GCN-NEXT: v_accvgpr_write_b32 a1, v13
975 ; GCN-NEXT: v_accvgpr_write_b32 a2, v14
976 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15
977 ; GCN-NEXT: v_accvgpr_write_b32 a4, v16
978 ; GCN-NEXT: v_accvgpr_write_b32 a5, v17
979 ; GCN-NEXT: v_accvgpr_write_b32 a6, v18
980 ; GCN-NEXT: v_accvgpr_write_b32 a7, v19
981 ; GCN-NEXT: v_accvgpr_write_b32 a8, v20
982 ; GCN-NEXT: v_accvgpr_write_b32 a9, v21
983 ; GCN-NEXT: v_accvgpr_write_b32 a10, v22
984 ; GCN-NEXT: v_accvgpr_write_b32 a11, v23
985 ; GCN-NEXT: v_accvgpr_write_b32 a12, v24
986 ; GCN-NEXT: v_accvgpr_write_b32 a13, v25
987 ; GCN-NEXT: v_accvgpr_write_b32 a14, v26
988 ; GCN-NEXT: v_accvgpr_write_b32 a15, v27
990 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10
993 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0
994 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1
995 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2
996 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3
997 ; GCN-NEXT: v_accvgpr_read_b32 v4, a4
998 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5
999 ; GCN-NEXT: v_accvgpr_read_b32 v6, a6
1000 ; GCN-NEXT: v_accvgpr_read_b32 v7, a7
1001 ; GCN-NEXT: v_accvgpr_read_b32 v8, a8
1002 ; GCN-NEXT: v_accvgpr_read_b32 v9, a9
1003 ; GCN-NEXT: v_accvgpr_read_b32 v10, a10
1004 ; GCN-NEXT: v_accvgpr_read_b32 v11, a11
1005 ; GCN-NEXT: v_accvgpr_read_b32 v12, a12
1006 ; GCN-NEXT: v_accvgpr_read_b32 v13, a13
1007 ; GCN-NEXT: v_accvgpr_read_b32 v14, a14
1008 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15
1009 ; GCN-NEXT: s_setpc_b64 s[30:31]
1010 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
1011 ret <16 x float> %result
1014 ; --------------------------------------------------------------------
1015 ; llvm.amdgcn.smfmac.i32.16x16x128.i8
1016 ; --------------------------------------------------------------------
; Intrinsic under test in this section. The third operand has the same
; <4 x i32> type as the result. Unlike the bf16 declarations earlier in the
; file, the two trailing i32 operands are not spelled immarg here; the calls
; in this section nevertheless pass constants, which the expected assembly
; prints as the cbsz and abid modifiers when non-zero.
1018 declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32)
; Kernel variant of the 16x16x128 i8 test. The IR loads a <4 x i32>
; accumulator from %arg at the element selected by the workitem id, calls the
; intrinsic with kernel arguments %a, %b, %idx and immediates (1, 2), and
; stores the result at the base of %arg.
; The two instruction selectors produce different code, so this test has
; separate SDAG and GISEL check blocks: they differ in scalar-load order, in
; using 32-bit versus 64-bit moves to populate the vector operands, and in the
; s_nop count before the store. Both expect the accumulator in v[8:11] and
; cbsz:1 abid:2 on the smfmac.
; NOTE(review): attribute group #0 is defined outside this excerpt.
1020 define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
1021 ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
1022 ; SDAG: ; %bb.0: ; %bb
1023 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
1024 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1025 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
1026 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1027 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
1028 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1029 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
1030 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
1031 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
1032 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
1033 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
1034 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
1035 ; SDAG-NEXT: v_mov_b32_e32 v15, s11
1036 ; SDAG-NEXT: v_mov_b32_e32 v0, s12
1037 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
1038 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
1039 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
1040 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1041 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
1042 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
1043 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
1044 ; SDAG-NEXT: v_mov_b32_e32 v7, s3
1045 ; SDAG-NEXT: v_mov_b32_e32 v17, s16
1046 ; SDAG-NEXT: s_waitcnt vmcnt(0)
1047 ; SDAG-NEXT: s_nop 0
1048 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
1049 ; SDAG-NEXT: s_nop 6
1050 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
1051 ; SDAG-NEXT: s_endpgm
1053 ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
1054 ; GISEL: ; %bb.0: ; %bb
1055 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1056 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1057 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
1058 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1059 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
1060 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1061 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
1062 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
1063 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1064 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
1065 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
1066 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
1067 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
1068 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
1069 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
1070 ; GISEL-NEXT: v_mov_b32_e32 v16, s2
1071 ; GISEL-NEXT: s_waitcnt vmcnt(0)
1072 ; GISEL-NEXT: s_nop 0
1073 ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
1074 ; GISEL-NEXT: v_mov_b32_e32 v0, 0
1075 ; GISEL-NEXT: s_nop 5
1076 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
1077 ; GISEL-NEXT: s_endpgm
1079 %id = call i32 @llvm.amdgcn.workitem.id.x()
1080 %gep = getelementptr <4 x i32>, ptr addrspace(1) %arg, i32 %id
1081 %in.1 = load <4 x i32>, ptr addrspace(1) %gep
1082 %mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %a, <8 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 1, i32 2)
1083 store <4 x i32> %mai.1, ptr addrspace(1) %arg
; Baseline callable-function test for the 16x16x128 i8 smfmac with immediates
; (0, 0); no cbsz/abid modifiers are expected.
; The selectors disagree on where the accumulator lives, hence two check
; blocks: the SDAG block expects it copied into a[0:3] and read back
; afterwards, while the GISEL block expects the smfmac to operate in place on
; v[12:15] followed by plain moves into the return registers v0-v3.
1087 define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
1088 ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8:
1090 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1091 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1092 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1093 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1094 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1095 ; SDAG-NEXT: s_nop 1
1096 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16
1097 ; SDAG-NEXT: s_nop 6
1098 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1099 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1100 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1101 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1102 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1104 ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8:
1106 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1107 ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
1108 ; GISEL-NEXT: s_nop 6
1109 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1110 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1111 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1112 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1113 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1114 %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
1115 ret <4 x i32> %result
; 16x16x128 i8 smfmac with trailing immediates (1, 3); both check blocks must
; see cbsz:1 abid:3 on the instruction. As in the baseline test, the SDAG
; block expects the accumulator staged through a[0:3] and the GISEL block
; expects it in place in v[12:15] with moves to v0-v3 afterwards.
1118 define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
1119 ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
1121 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1122 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1123 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1124 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1125 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1126 ; SDAG-NEXT: s_nop 1
1127 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
1128 ; SDAG-NEXT: s_nop 6
1129 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1130 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1131 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1132 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1133 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1135 ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
1137 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1138 ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
1139 ; GISEL-NEXT: s_nop 6
1140 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1141 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1142 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1143 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1144 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1145 %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
1146 ret <4 x i32> %result
; 16x16x128 i8 smfmac with trailing immediates (3, 1), the reverse of the
; __flags0 test; both check blocks must see cbsz:3 abid:1. Register placement
; per selector is the same as in the baseline test (AGPR staging in the SDAG
; block, in-place v[12:15] in the GISEL block).
1149 define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
1150 ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
1152 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1154 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1155 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1156 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1157 ; SDAG-NEXT: s_nop 1
1158 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
1159 ; SDAG-NEXT: s_nop 6
1160 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1161 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1162 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1163 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1164 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1166 ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
1168 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1169 ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
1170 ; GISEL-NEXT: s_nop 6
1171 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1172 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1173 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1174 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1175 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1176 %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
1177 ret <4 x i32> %result
; Every argument is marked inreg; immediates are (0, 0), so no cbsz/abid
; modifiers are expected.
; The SDAG block expects 32-bit moves from s0-s3 / s16-s23 / s28 into VGPRs and
; the accumulator written from s24-s27 directly into a[0:3], then read back to
; v0-v3. The GISEL block expects 64-bit moves of s-register pairs, with the
; accumulator placed in v[0:3] so the smfmac result is already in the return
; registers and no copy follows the instruction.
1180 define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) {
1181 ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
1183 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1184 ; SDAG-NEXT: v_mov_b32_e32 v8, s0
1185 ; SDAG-NEXT: v_mov_b32_e32 v9, s1
1186 ; SDAG-NEXT: v_mov_b32_e32 v10, s2
1187 ; SDAG-NEXT: v_mov_b32_e32 v11, s3
1188 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
1189 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
1190 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
1191 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
1192 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
1193 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
1194 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
1195 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
1196 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
1197 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
1198 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
1199 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
1200 ; SDAG-NEXT: v_mov_b32_e32 v12, s28
1201 ; SDAG-NEXT: s_nop 1
1202 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
1203 ; SDAG-NEXT: s_nop 6
1204 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1205 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1206 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1207 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1208 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1210 ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
1212 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1213 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
1214 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
1215 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
1216 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
1217 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
1218 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
1219 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
1220 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
1221 ; GISEL-NEXT: v_mov_b32_e32 v16, s28
1222 ; GISEL-NEXT: s_nop 1
1223 ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
1224 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1225 %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
1226 ret <4 x i32> %result
1229 ; --------------------------------------------------------------------
1230 ; llvm.amdgcn.smfmac.i32.32x32x64.i8
1231 ; --------------------------------------------------------------------
; Intrinsic under test in this section. The third operand has the same
; <16 x i32> type as the result. The two trailing i32 operands are not spelled
; immarg in this declaration; the calls visible below still pass constants,
; which the expected assembly prints as cbsz and abid when non-zero.
1233 declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32>, <8 x i32>, <16 x i32>, i32, i32, i32)
; Kernel variant of the 32x32x64 i8 test. The IR loads a <16 x i32>
; accumulator from %arg at the element selected by the workitem id, calls the
; intrinsic with kernel arguments %a, %b, %idx and immediates (1, 2), and
; stores the result at the base of %arg.
; Both check blocks expect the accumulator in v[0:15], %a in v[24:27], %b in
; v[16:23], the index in v28, and cbsz:1 abid:2 on the smfmac. They differ in
; the order of the four global loads/stores, the order of the scalar loads, and
; in 32-bit (SDAG block) versus 64-bit (GISEL block) moves for the operands.
; NOTE(review): attribute group #0 is defined outside this excerpt.
1235 define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
1236 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
1237 ; SDAG: ; %bb.0: ; %bb
1238 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1239 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1240 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
1241 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1242 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
1243 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
1244 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
1245 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
1246 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1247 ; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
1248 ; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
1249 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1250 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
1251 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
1252 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
1253 ; SDAG-NEXT: v_mov_b32_e32 v27, s11
1254 ; SDAG-NEXT: v_mov_b32_e32 v16, s12
1255 ; SDAG-NEXT: v_mov_b32_e32 v17, s13
1256 ; SDAG-NEXT: v_mov_b32_e32 v18, s14
1257 ; SDAG-NEXT: v_mov_b32_e32 v19, s15
1258 ; SDAG-NEXT: v_mov_b32_e32 v20, s16
1259 ; SDAG-NEXT: v_mov_b32_e32 v21, s17
1260 ; SDAG-NEXT: v_mov_b32_e32 v22, s18
1261 ; SDAG-NEXT: v_mov_b32_e32 v23, s19
1262 ; SDAG-NEXT: v_mov_b32_e32 v28, s2
1263 ; SDAG-NEXT: s_waitcnt vmcnt(0)
1264 ; SDAG-NEXT: s_nop 0
1265 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
1266 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
1267 ; SDAG-NEXT: s_nop 7
1268 ; SDAG-NEXT: s_nop 1
1269 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
1270 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
1271 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
1272 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
1273 ; SDAG-NEXT: s_endpgm
1275 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
1276 ; GISEL: ; %bb.0: ; %bb
1277 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1278 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1279 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
1280 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1281 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
1282 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
1283 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
1284 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
1285 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1286 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
1287 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
1288 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1289 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
1290 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
1291 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
1292 ; GISEL-NEXT: v_mov_b32_e32 v28, s2
1293 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
1294 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
1295 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
1296 ; GISEL-NEXT: s_waitcnt vmcnt(0)
1297 ; GISEL-NEXT: s_nop 0
1298 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
1299 ; GISEL-NEXT: v_mov_b32_e32 v16, 0
1300 ; GISEL-NEXT: s_nop 7
1301 ; GISEL-NEXT: s_nop 1
1302 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
1303 ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
1304 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
1305 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
1306 ; GISEL-NEXT: s_endpgm
1308 %id = call i32 @llvm.amdgcn.workitem.id.x()
1309 %gep = getelementptr <16 x i32>, ptr addrspace(1) %arg, i32 %id
1310 %in.1 = load <16 x i32>, ptr addrspace(1) %gep
1311 %mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %a, <8 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 1, i32 2)
1312 store <16 x i32> %mai.1, ptr addrspace(1) %arg
; Baseline callable-function test for the 32x32x64 i8 smfmac with immediates
; (0, 0); no cbsz/abid modifiers are expected.
; The SDAG block expects the accumulator (incoming v12-v27) staged through
; a[0:15] and read back into v0-v15. The GISEL block instead expects %arg0 and
; %arg1 to be moved out of the way (to v[48:51] and v[30:37]) so the
; accumulator can be shifted down into v[0:15]; the smfmac then writes the
; return registers directly and nothing follows it but the return.
1316 define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
1317 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
1319 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1320 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1321 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1322 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1323 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1324 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
1325 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
1326 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
1327 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
1328 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
1329 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
1330 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
1331 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
1332 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
1333 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
1334 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
1335 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
1336 ; SDAG-NEXT: s_nop 1
1337 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28
1338 ; SDAG-NEXT: s_nop 7
1339 ; SDAG-NEXT: s_nop 2
1340 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1341 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1342 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1343 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1344 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
1345 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
1346 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
1347 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
1348 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
1349 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
1350 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
1351 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
1352 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
1353 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
1354 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
1355 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
1356 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1358 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
1360 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1361 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
1362 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
1363 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
1364 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
1365 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
1366 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
1367 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
1368 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
1369 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
1370 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
1371 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
1372 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
1373 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
1374 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
1375 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
1376 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
1377 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
1378 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
1379 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
1380 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
1381 ; GISEL-NEXT: s_nop 1
1382 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
1383 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1384 %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
1385 ret <16 x i32> %result
; 32x32x64 i8 smfmac with trailing immediates (1, 3); both check blocks must
; see cbsz:1 abid:3 on the instruction. Register placement per selector
; matches the baseline test: AGPR staging through a[0:15] in the SDAG block,
; and in the GISEL block the inputs relocated to v[48:51] / v[30:37] with the
; accumulator shifted into v[0:15] so the result lands in the return registers.
1388 define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
1389 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
1391 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1392 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1393 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1394 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1395 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1396 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
1397 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
1398 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
1399 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
1400 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
1401 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
1402 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
1403 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
1404 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
1405 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
1406 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
1407 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
1408 ; SDAG-NEXT: s_nop 1
1409 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
1410 ; SDAG-NEXT: s_nop 7
1411 ; SDAG-NEXT: s_nop 2
1412 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1413 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1414 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1415 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1416 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
1417 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
1418 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
1419 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
1420 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
1421 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
1422 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
1423 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
1424 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
1425 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
1426 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
1427 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
1428 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1430 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
1432 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1433 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
1434 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
1435 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
1436 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
1437 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
1438 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
1439 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
1440 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
1441 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
1442 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
1443 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
1444 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
1445 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
1446 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
1447 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
1448 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
1449 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
1450 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
1451 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
1452 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
1453 ; GISEL-NEXT: s_nop 1
1454 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
1455 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1456 %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
1457 ret <16 x i32> %result
; Checks llvm.amdgcn.smfmac.i32.32x32x64.i8 with the two trailing immediate
; operands set to 3 and 1; the expected instruction prints them as
; cbsz:3 abid:1. The SDAG checks expect the accumulator in a[0:15] (written
; and read back through v_accvgpr_*), while the GISEL checks expect it in
; v[0:15].
1460 define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
1461 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
1463 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1464 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1465 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1466 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1467 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1468 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
1469 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
1470 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
1471 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
1472 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
1473 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
1474 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
1475 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
1476 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
1477 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
1478 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
1479 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
1480 ; SDAG-NEXT: s_nop 1
1481 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
1482 ; SDAG-NEXT: s_nop 7
1483 ; SDAG-NEXT: s_nop 2
1484 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1485 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1486 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1487 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1488 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
1489 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
1490 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
1491 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
1492 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
1493 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
1494 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
1495 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
1496 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
1497 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
1498 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
1499 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
1500 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1502 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
1504 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1505 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
1506 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
1507 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
1508 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
1509 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
1510 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
1511 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
1512 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
1513 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
1514 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
1515 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
1516 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
1517 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
1518 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
1519 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
1520 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
1521 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
1522 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
1523 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
1524 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
1525 ; GISEL-NEXT: s_nop 1
1526 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
1527 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1528 %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
1529 ret <16 x i32> %result
; Checks llvm.amdgcn.smfmac.i32.32x32x64.i8 when every argument is marked
; inreg. Both trailing immediates are 0 and the expected instruction carries
; no cbsz/abid modifiers. The expected code copies the incoming s-register
; values into VGPRs before the smfmac; SDAG additionally stages the
; accumulator in a[0:15], while GISEL keeps it in v[0:15].
1532 define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) {
1533 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
1535 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1536 ; SDAG-NEXT: v_mov_b32_e32 v28, s0
1537 ; SDAG-NEXT: v_mov_b32_e32 v29, s1
1538 ; SDAG-NEXT: v_mov_b32_e32 v30, s2
1539 ; SDAG-NEXT: v_mov_b32_e32 v31, s3
1540 ; SDAG-NEXT: v_mov_b32_e32 v12, s24
1541 ; SDAG-NEXT: v_mov_b32_e32 v27, v9
1542 ; SDAG-NEXT: v_mov_b32_e32 v26, v8
1543 ; SDAG-NEXT: v_mov_b32_e32 v25, v7
1544 ; SDAG-NEXT: v_mov_b32_e32 v24, v6
1545 ; SDAG-NEXT: v_mov_b32_e32 v23, v5
1546 ; SDAG-NEXT: v_mov_b32_e32 v22, v4
1547 ; SDAG-NEXT: v_mov_b32_e32 v21, v3
1548 ; SDAG-NEXT: v_mov_b32_e32 v20, v2
1549 ; SDAG-NEXT: v_mov_b32_e32 v19, v1
1550 ; SDAG-NEXT: v_mov_b32_e32 v18, v0
1551 ; SDAG-NEXT: v_mov_b32_e32 v13, s25
1552 ; SDAG-NEXT: v_mov_b32_e32 v14, s26
1553 ; SDAG-NEXT: v_mov_b32_e32 v15, s27
1554 ; SDAG-NEXT: v_mov_b32_e32 v16, s28
1555 ; SDAG-NEXT: v_mov_b32_e32 v17, s29
1556 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1557 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
1558 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
1559 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
1560 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
1561 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
1562 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
1563 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
1564 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
1565 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1566 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1567 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1568 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
1569 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
1570 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
1571 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
1572 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
1573 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
1574 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
1575 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
1576 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
1577 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
1578 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
1579 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
1580 ; SDAG-NEXT: s_nop 1
1581 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10
1582 ; SDAG-NEXT: s_nop 7
1583 ; SDAG-NEXT: s_nop 2
1584 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1585 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1586 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1587 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1588 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
1589 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
1590 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
1591 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
1592 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
1593 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
1594 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
1595 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
1596 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
1597 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
1598 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
1599 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
1600 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1602 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
1604 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1605 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
1606 ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
1607 ; GISEL-NEXT: v_mov_b32_e32 v18, s24
1608 ; GISEL-NEXT: v_mov_b32_e32 v19, s25
1609 ; GISEL-NEXT: v_mov_b32_e32 v24, v0
1610 ; GISEL-NEXT: v_mov_b32_e32 v25, v1
1611 ; GISEL-NEXT: v_mov_b32_e32 v26, v2
1612 ; GISEL-NEXT: v_mov_b32_e32 v27, v3
1613 ; GISEL-NEXT: v_mov_b32_e32 v28, v4
1614 ; GISEL-NEXT: v_mov_b32_e32 v29, v5
1615 ; GISEL-NEXT: v_mov_b32_e32 v30, v6
1616 ; GISEL-NEXT: v_mov_b32_e32 v31, v7
1617 ; GISEL-NEXT: v_mov_b32_e32 v32, v8
1618 ; GISEL-NEXT: v_mov_b32_e32 v33, v9
1619 ; GISEL-NEXT: v_mov_b32_e32 v16, v10
1620 ; GISEL-NEXT: v_mov_b32_e32 v20, s26
1621 ; GISEL-NEXT: v_mov_b32_e32 v21, s27
1622 ; GISEL-NEXT: v_mov_b32_e32 v22, s28
1623 ; GISEL-NEXT: v_mov_b32_e32 v23, s29
1624 ; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
1625 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
1626 ; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
1627 ; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
1628 ; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
1629 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
1630 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
1631 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
1632 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
1633 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
1634 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
1635 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
1636 ; GISEL-NEXT: s_nop 1
1637 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[34:37], v[48:55], v16
1638 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1639 %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
1640 ret <16 x i32> %result
1643 ; --------------------------------------------------------------------
1644 ; llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8
1645 ; --------------------------------------------------------------------
1647 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
; Kernel test for llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8: the accumulator
; is loaded from %arg at the workitem-id-x element, the intrinsic is called
; with %a, %b, %idx and the immediates 1 and 2 (printed as cbsz:1 abid:2 in
; the expected instruction), and the result is stored to %arg.
; NOTE(review): attribute group #0 is defined outside this part of the file.
1649 define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
1650 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
1651 ; SDAG: ; %bb.0: ; %bb
1652 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
1653 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1654 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
1655 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1656 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
1657 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1658 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
1659 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
1660 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
1661 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
1662 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
1663 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
1664 ; SDAG-NEXT: v_mov_b32_e32 v15, s11
1665 ; SDAG-NEXT: v_mov_b32_e32 v0, s12
1666 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
1667 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
1668 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
1669 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1670 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
1671 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
1672 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
1673 ; SDAG-NEXT: v_mov_b32_e32 v7, s3
1674 ; SDAG-NEXT: v_mov_b32_e32 v17, s16
1675 ; SDAG-NEXT: s_waitcnt vmcnt(0)
1676 ; SDAG-NEXT: s_nop 0
1677 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
1678 ; SDAG-NEXT: s_nop 6
1679 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
1680 ; SDAG-NEXT: s_endpgm
1682 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
1683 ; GISEL: ; %bb.0: ; %bb
1684 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1685 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1686 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
1687 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1688 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
1689 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1690 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
1691 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
1692 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1693 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
1694 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
1695 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
1696 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
1697 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
1698 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
1699 ; GISEL-NEXT: v_mov_b32_e32 v16, s2
1700 ; GISEL-NEXT: s_waitcnt vmcnt(0)
1701 ; GISEL-NEXT: s_nop 0
1702 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
1703 ; GISEL-NEXT: v_mov_b32_e32 v0, 0
1704 ; GISEL-NEXT: s_nop 5
1705 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
1706 ; GISEL-NEXT: s_endpgm
1708 %id = call i32 @llvm.amdgcn.workitem.id.x()
1709 %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
1710 %in.1 = load <4 x float>, ptr addrspace(1) %gep
1711 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
1712 store <4 x float> %mai.1, ptr addrspace(1) %arg
; Checks llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 with both trailing
; immediates 0; the expected instruction carries no cbsz/abid modifiers.
; SDAG expects the accumulator in a[0:3]; GISEL expects it in v[12:15] and
; then copies the result into v0-v3.
1716 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
1717 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
1719 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1720 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1721 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1722 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1723 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1724 ; SDAG-NEXT: s_nop 1
1725 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16
1726 ; SDAG-NEXT: s_nop 6
1727 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1728 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1729 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1730 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1731 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1733 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
1735 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1736 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
1737 ; GISEL-NEXT: s_nop 6
1738 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1739 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1740 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1741 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1742 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1743 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
1744 ret <4 x float> %result
; Checks llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 with the trailing
; immediates set to 1 and 3; the expected instruction prints them as
; cbsz:1 abid:3. SDAG expects the accumulator in a[0:3]; GISEL expects it in
; v[12:15] and then copies the result into v0-v3.
1747 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
1748 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
1750 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1752 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1753 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1754 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1755 ; SDAG-NEXT: s_nop 1
1756 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
1757 ; SDAG-NEXT: s_nop 6
1758 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1759 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1760 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1761 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1762 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1764 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
1766 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1767 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
1768 ; GISEL-NEXT: s_nop 6
1769 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1770 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1771 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1772 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1773 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1774 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
1775 ret <4 x float> %result
; Checks llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 with the trailing
; immediates set to 3 and 1; the expected instruction prints them as
; cbsz:3 abid:1. SDAG expects the accumulator in a[0:3]; GISEL expects it in
; v[12:15] and then copies the result into v0-v3.
1778 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
1779 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
1781 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1782 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1783 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1784 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1785 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1786 ; SDAG-NEXT: s_nop 1
1787 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
1788 ; SDAG-NEXT: s_nop 6
1789 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1790 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1791 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1792 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1793 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1795 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
1797 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1798 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
1799 ; GISEL-NEXT: s_nop 6
1800 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1801 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1802 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1803 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1804 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1805 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
1806 ret <4 x float> %result
; Checks llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 when every argument is
; marked inreg. Both trailing immediates are 0 and the expected instruction
; carries no cbsz/abid modifiers. The expected code moves the incoming
; s-register values into vector registers first; SDAG writes the accumulator
; straight from s24-s27 into a[0:3], GISEL places it in v[0:3].
1809 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
1810 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
1812 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1813 ; SDAG-NEXT: v_mov_b32_e32 v8, s0
1814 ; SDAG-NEXT: v_mov_b32_e32 v9, s1
1815 ; SDAG-NEXT: v_mov_b32_e32 v10, s2
1816 ; SDAG-NEXT: v_mov_b32_e32 v11, s3
1817 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
1818 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
1819 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
1820 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
1821 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
1822 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
1823 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
1824 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
1825 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
1826 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
1827 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
1828 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
1829 ; SDAG-NEXT: v_mov_b32_e32 v12, s28
1830 ; SDAG-NEXT: s_nop 1
1831 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
1832 ; SDAG-NEXT: s_nop 6
1833 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1834 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1835 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1836 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1837 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1839 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
1841 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1842 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
1843 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
1844 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
1845 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
1846 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
1847 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
1848 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
1849 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
1850 ; GISEL-NEXT: v_mov_b32_e32 v16, s28
1851 ; GISEL-NEXT: s_nop 1
1852 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
1853 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1854 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
1855 ret <4 x float> %result
1858 ; --------------------------------------------------------------------
1859 ; llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8
1860 ; --------------------------------------------------------------------
1862 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
; Kernel test for llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8: the accumulator
; is loaded from %arg at the workitem-id-x element, the intrinsic is called
; with %a, %b, %idx and the immediates 1 and 2 (printed as cbsz:1 abid:2 in
; the expected instruction), and the result is stored to %arg.
; NOTE(review): attribute group #0 is defined outside this part of the file.
1864 define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
1865 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
1866 ; SDAG: ; %bb.0: ; %bb
1867 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
1868 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1869 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
1870 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1871 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
1872 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1873 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
1874 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
1875 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
1876 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
1877 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
1878 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
1879 ; SDAG-NEXT: v_mov_b32_e32 v15, s11
1880 ; SDAG-NEXT: v_mov_b32_e32 v0, s12
1881 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
1882 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
1883 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
1884 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
1885 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
1886 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
1887 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
1888 ; SDAG-NEXT: v_mov_b32_e32 v7, s3
1889 ; SDAG-NEXT: v_mov_b32_e32 v17, s16
1890 ; SDAG-NEXT: s_waitcnt vmcnt(0)
1891 ; SDAG-NEXT: s_nop 0
1892 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
1893 ; SDAG-NEXT: s_nop 6
1894 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
1895 ; SDAG-NEXT: s_endpgm
1897 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
1898 ; GISEL: ; %bb.0: ; %bb
1899 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1900 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1901 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
1902 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1903 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
1904 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1905 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
1906 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
1907 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
1908 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
1909 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
1910 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
1911 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
1912 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
1913 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
1914 ; GISEL-NEXT: v_mov_b32_e32 v16, s2
1915 ; GISEL-NEXT: s_waitcnt vmcnt(0)
1916 ; GISEL-NEXT: s_nop 0
1917 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
1918 ; GISEL-NEXT: v_mov_b32_e32 v0, 0
1919 ; GISEL-NEXT: s_nop 5
1920 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
1921 ; GISEL-NEXT: s_endpgm
1923 %id = call i32 @llvm.amdgcn.workitem.id.x()
1924 %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
1925 %in.1 = load <4 x float>, ptr addrspace(1) %gep
1926 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
1927 store <4 x float> %mai.1, ptr addrspace(1) %arg
; Checks llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 with both trailing
; immediates 0; the expected instruction carries no cbsz/abid modifiers.
; SDAG expects the accumulator in a[0:3]; GISEL expects it in v[12:15] and
; then copies the result into v0-v3.
1931 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
1932 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
1934 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1935 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1936 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1937 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1938 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1939 ; SDAG-NEXT: s_nop 1
1940 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
1941 ; SDAG-NEXT: s_nop 6
1942 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1943 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1944 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1945 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1946 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1948 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
1950 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1951 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
1952 ; GISEL-NEXT: s_nop 6
1953 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1954 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1955 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1956 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1957 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1958 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
1959 ret <4 x float> %result
; Checks llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 with the trailing
; immediates set to 1 and 3; the expected instruction prints them as
; cbsz:1 abid:3. SDAG expects the accumulator in a[0:3]; GISEL expects it in
; v[12:15] and then copies the result into v0-v3.
1962 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
1963 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
1965 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1966 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1967 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1968 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
1969 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
1970 ; SDAG-NEXT: s_nop 1
1971 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
1972 ; SDAG-NEXT: s_nop 6
1973 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
1974 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
1975 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
1976 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
1977 ; SDAG-NEXT: s_setpc_b64 s[30:31]
1979 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
1981 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1982 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
1983 ; GISEL-NEXT: s_nop 6
1984 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
1985 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
1986 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
1987 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
1988 ; GISEL-NEXT: s_setpc_b64 s[30:31]
1989 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
1990 ret <4 x float> %result
; Checks llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 with the trailing
; immediates set to 3 and 1; the expected instruction prints them as
; cbsz:3 abid:1. SDAG expects the accumulator in a[0:3]; GISEL expects it in
; v[12:15] and then copies the result into v0-v3.
1993 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
1994 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
1996 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1997 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
1998 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
1999 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2000 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2001 ; SDAG-NEXT: s_nop 1
2002 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2003 ; SDAG-NEXT: s_nop 6
2004 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2005 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2006 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2007 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2008 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2010 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2012 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2013 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2014 ; GISEL-NEXT: s_nop 6
2015 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2016 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2017 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2018 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2019 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2020 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
2021 ret <4 x float> %result
; Checks llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 when every argument is
; marked inreg. Both trailing immediates are 0 and the expected instruction
; carries no cbsz/abid modifiers. The expected code moves the incoming
; s-register values into vector registers first; SDAG writes the accumulator
; straight from s24-s27 into a[0:3], GISEL places it in v[0:3].
2024 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
2025 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2027 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2028 ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2029 ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2030 ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2031 ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2032 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
2033 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
2034 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
2035 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
2036 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
2037 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
2038 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
2039 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
2040 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2041 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2042 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2043 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2044 ; SDAG-NEXT: v_mov_b32_e32 v12, s28
2045 ; SDAG-NEXT: s_nop 1
2046 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
2047 ; SDAG-NEXT: s_nop 6
2048 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2049 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2050 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2051 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2052 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2054 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2056 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2057 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2058 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2059 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2060 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2061 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2062 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2063 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2064 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2065 ; GISEL-NEXT: v_mov_b32_e32 v16, s28
2066 ; GISEL-NEXT: s_nop 1
2067 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
2068 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2069 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2070 ret <4 x float> %result
2073 ; --------------------------------------------------------------------
2074 ; llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8
2075 ; --------------------------------------------------------------------
2077 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
; Kernel test for llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8: the accumulator
; is loaded from %arg at the workitem-id-x element, the intrinsic is called
; with %a, %b, %idx and the immediates 1 and 2 (printed as cbsz:1 abid:2 in
; the expected instruction), and the result is stored to %arg.
; NOTE(review): attribute group #0 is defined outside this part of the file.
2079 define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
2080 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2081 ; SDAG: ; %bb.0: ; %bb
2082 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2083 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2084 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2085 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2086 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2087 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2088 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
2089 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
2090 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
2091 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
2092 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
2093 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
2094 ; SDAG-NEXT: v_mov_b32_e32 v15, s11
2095 ; SDAG-NEXT: v_mov_b32_e32 v0, s12
2096 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
2097 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
2098 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
2099 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2100 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
2101 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
2102 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
2103 ; SDAG-NEXT: v_mov_b32_e32 v7, s3
2104 ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2105 ; SDAG-NEXT: s_waitcnt vmcnt(0)
2106 ; SDAG-NEXT: s_nop 0
2107 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2108 ; SDAG-NEXT: s_nop 6
2109 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
2110 ; SDAG-NEXT: s_endpgm
2112 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2113 ; GISEL: ; %bb.0: ; %bb
2114 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2115 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2116 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2117 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2118 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
2119 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2120 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2121 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2122 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2123 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
2124 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
2125 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2126 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2127 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2128 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2129 ; GISEL-NEXT: v_mov_b32_e32 v16, s2
2130 ; GISEL-NEXT: s_waitcnt vmcnt(0)
2131 ; GISEL-NEXT: s_nop 0
2132 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2133 ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2134 ; GISEL-NEXT: s_nop 5
2135 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
2136 ; GISEL-NEXT: s_endpgm
2138 %id = call i32 @llvm.amdgcn.workitem.id.x()
2139 %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
2140 %in.1 = load <4 x float>, ptr addrspace(1) %gep
2141 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
2142 store <4 x float> %mai.1, ptr addrspace(1) %arg
; Non-kernel variant with ordinary (non-inreg) arguments. Both immediate
; operands are 0, and the checked instruction carries no cbsz/abid modifiers.
; The SDAG checks route the accumulator through a[0:3]; the GISEL checks keep
; it in v[12:15] and copy the result to v[0:3].
2146 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2147 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2149 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2150 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2151 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2152 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2153 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2154 ; SDAG-NEXT: s_nop 1
2155 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
2156 ; SDAG-NEXT: s_nop 6
2157 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2158 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2159 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2160 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2161 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2163 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2165 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2166 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
2167 ; GISEL-NEXT: s_nop 6
2168 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2169 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2170 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2171 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2172 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2173 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2174 ret <4 x float> %result
; Same shape as the plain non-kernel test, but the two immediate operands are
; 1 and 3; the checks expect them to appear as cbsz:1 abid:3 on the instruction.
2177 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2178 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2180 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2181 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2182 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2183 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2184 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2185 ; SDAG-NEXT: s_nop 1
2186 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2187 ; SDAG-NEXT: s_nop 6
2188 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2189 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2190 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2191 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2192 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2194 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2196 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2197 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2198 ; GISEL-NEXT: s_nop 6
2199 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2200 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2201 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2202 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2203 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2204 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
2205 ret <4 x float> %result
; Counterpart of the __flags0 test with the immediate values swapped: the
; operands are 3 and 1, and the checks expect cbsz:3 abid:1 on the instruction.
2208 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2209 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2211 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2212 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2213 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2214 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2215 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2216 ; SDAG-NEXT: s_nop 1
2217 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2218 ; SDAG-NEXT: s_nop 6
2219 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2220 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2221 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2222 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2223 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2225 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2227 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2228 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2229 ; GISEL-NEXT: s_nop 6
2230 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2231 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2232 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2233 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2234 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2235 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
2236 ret <4 x float> %result
; Every argument is marked inreg. The checks expect the incoming s-register
; values to be copied into vector registers before the instruction (the SDAG
; checks write the accumulator into a[0:3]). Both immediates are 0, so no
; cbsz/abid modifiers are expected.
2239 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
2240 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2242 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2243 ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2244 ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2245 ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2246 ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2247 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
2248 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
2249 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
2250 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
2251 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
2252 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
2253 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
2254 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
2255 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2256 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2257 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2258 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2259 ; SDAG-NEXT: v_mov_b32_e32 v12, s28
2260 ; SDAG-NEXT: s_nop 1
2261 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
2262 ; SDAG-NEXT: s_nop 6
2263 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2264 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2265 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2266 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2267 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2269 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2271 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2272 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2273 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2274 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2275 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2276 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2277 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2278 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2279 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2280 ; GISEL-NEXT: v_mov_b32_e32 v16, s28
2281 ; GISEL-NEXT: s_nop 1
2282 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
2283 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2284 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2285 ret <4 x float> %result
2288 ; --------------------------------------------------------------------
2289 ; llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8
2290 ; --------------------------------------------------------------------
2292 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
; amdgpu_kernel variant. The accumulator is loaded from %arg at the index given
; by the workitem id, and the intrinsic result is stored back to %arg itself.
; The immediate operands are 1 and 2, checked as cbsz:1 abid:2.
2294 define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
2295 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2296 ; SDAG: ; %bb.0: ; %bb
2297 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2298 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2299 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2300 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2301 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2302 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2303 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
2304 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
2305 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
2306 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
2307 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
2308 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
2309 ; SDAG-NEXT: v_mov_b32_e32 v15, s11
2310 ; SDAG-NEXT: v_mov_b32_e32 v0, s12
2311 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
2312 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
2313 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
2314 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2315 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
2316 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
2317 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
2318 ; SDAG-NEXT: v_mov_b32_e32 v7, s3
2319 ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2320 ; SDAG-NEXT: s_waitcnt vmcnt(0)
2321 ; SDAG-NEXT: s_nop 0
2322 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2323 ; SDAG-NEXT: s_nop 6
2324 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
2325 ; SDAG-NEXT: s_endpgm
2327 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2328 ; GISEL: ; %bb.0: ; %bb
2329 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2330 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2331 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2332 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2333 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
2334 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2335 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2336 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2337 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2338 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
2339 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
2340 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2341 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2342 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2343 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2344 ; GISEL-NEXT: v_mov_b32_e32 v16, s2
2345 ; GISEL-NEXT: s_waitcnt vmcnt(0)
2346 ; GISEL-NEXT: s_nop 0
2347 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2348 ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2349 ; GISEL-NEXT: s_nop 5
2350 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
2351 ; GISEL-NEXT: s_endpgm
2353 %id = call i32 @llvm.amdgcn.workitem.id.x()
2354 %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
2355 %in.1 = load <4 x float>, ptr addrspace(1) %gep
2356 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
2357 store <4 x float> %mai.1, ptr addrspace(1) %arg
; Non-kernel variant with ordinary (non-inreg) arguments. Both immediate
; operands are 0, and the checked instruction carries no cbsz/abid modifiers.
; The SDAG checks route the accumulator through a[0:3]; the GISEL checks keep
; it in v[12:15] and copy the result to v[0:3].
2361 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2362 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2364 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2365 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2366 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2367 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2368 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2369 ; SDAG-NEXT: s_nop 1
2370 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
2371 ; SDAG-NEXT: s_nop 6
2372 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2373 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2374 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2375 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2376 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2378 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2380 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2381 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
2382 ; GISEL-NEXT: s_nop 6
2383 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2384 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2385 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2386 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2387 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2388 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2389 ret <4 x float> %result
; Same shape as the plain non-kernel test, but the two immediate operands are
; 1 and 3; the checks expect them to appear as cbsz:1 abid:3 on the instruction.
2392 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2393 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2395 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2396 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2397 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2398 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2399 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2400 ; SDAG-NEXT: s_nop 1
2401 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2402 ; SDAG-NEXT: s_nop 6
2403 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2404 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2405 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2406 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2407 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2409 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2411 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2412 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2413 ; GISEL-NEXT: s_nop 6
2414 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2415 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2416 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2417 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2418 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2419 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
2420 ret <4 x float> %result
; Counterpart of the __flags0 test with the immediate values swapped: the
; operands are 3 and 1, and the checks expect cbsz:3 abid:1 on the instruction.
2423 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2424 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2426 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2427 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2428 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2429 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2430 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2431 ; SDAG-NEXT: s_nop 1
2432 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2433 ; SDAG-NEXT: s_nop 6
2434 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2435 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2436 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2437 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2438 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2440 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2442 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2443 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2444 ; GISEL-NEXT: s_nop 6
2445 ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2446 ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2447 ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2448 ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2449 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2450 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
2451 ret <4 x float> %result
; Every argument is marked inreg. The checks expect the incoming s-register
; values to be copied into vector registers before the instruction (the SDAG
; checks write the accumulator into a[0:3]). Both immediates are 0, so no
; cbsz/abid modifiers are expected.
2454 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
2455 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2457 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2458 ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2459 ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2460 ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2461 ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2462 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
2463 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
2464 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
2465 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
2466 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
2467 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
2468 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
2469 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
2470 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2471 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2472 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2473 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2474 ; SDAG-NEXT: v_mov_b32_e32 v12, s28
2475 ; SDAG-NEXT: s_nop 1
2476 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
2477 ; SDAG-NEXT: s_nop 6
2478 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2479 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2480 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2481 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2482 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2484 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2486 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2487 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2488 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2489 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2490 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2491 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2492 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2493 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2494 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2495 ; GISEL-NEXT: v_mov_b32_e32 v16, s28
2496 ; GISEL-NEXT: s_nop 1
2497 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
2498 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2499 %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2500 ret <4 x float> %result
2503 ; --------------------------------------------------------------------
2504 ; llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8
2505 ; --------------------------------------------------------------------
2507 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)
; amdgpu_kernel variant for the <16 x float> accumulator. The accumulator is
; loaded from %arg at the index given by the workitem id (checked as four
; dwordx4 loads), and the result is stored back to %arg itself (four dwordx4
; stores). The immediate operands are 1 and 2, checked as cbsz:1 abid:2.
2509 define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
2510 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
2511 ; SDAG: ; %bb.0: ; %bb
2512 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2513 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2514 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
2515 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2516 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
2517 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
2518 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
2519 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
2520 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2521 ; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
2522 ; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2523 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2524 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
2525 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
2526 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
2527 ; SDAG-NEXT: v_mov_b32_e32 v27, s11
2528 ; SDAG-NEXT: v_mov_b32_e32 v16, s12
2529 ; SDAG-NEXT: v_mov_b32_e32 v17, s13
2530 ; SDAG-NEXT: v_mov_b32_e32 v18, s14
2531 ; SDAG-NEXT: v_mov_b32_e32 v19, s15
2532 ; SDAG-NEXT: v_mov_b32_e32 v20, s16
2533 ; SDAG-NEXT: v_mov_b32_e32 v21, s17
2534 ; SDAG-NEXT: v_mov_b32_e32 v22, s18
2535 ; SDAG-NEXT: v_mov_b32_e32 v23, s19
2536 ; SDAG-NEXT: v_mov_b32_e32 v28, s2
2537 ; SDAG-NEXT: s_waitcnt vmcnt(0)
2538 ; SDAG-NEXT: s_nop 0
2539 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
2540 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2541 ; SDAG-NEXT: s_nop 7
2542 ; SDAG-NEXT: s_nop 1
2543 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
2544 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
2545 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
2546 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
2547 ; SDAG-NEXT: s_endpgm
2549 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
2550 ; GISEL: ; %bb.0: ; %bb
2551 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2552 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2553 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
2554 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2555 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
2556 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
2557 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
2558 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
2559 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2560 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2561 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2562 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2563 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
2564 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
2565 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
2566 ; GISEL-NEXT: v_mov_b32_e32 v28, s2
2567 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
2568 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
2569 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
2570 ; GISEL-NEXT: s_waitcnt vmcnt(0)
2571 ; GISEL-NEXT: s_nop 0
2572 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
2573 ; GISEL-NEXT: v_mov_b32_e32 v16, 0
2574 ; GISEL-NEXT: s_nop 7
2575 ; GISEL-NEXT: s_nop 1
2576 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
2577 ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
2578 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
2579 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
2580 ; GISEL-NEXT: s_endpgm
2582 %id = call i32 @llvm.amdgcn.workitem.id.x()
2583 %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
2584 %in.1 = load <16 x float>, ptr addrspace(1) %gep
2585 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
2586 store <16 x float> %mai.1, ptr addrspace(1) %arg
; Non-kernel variant with ordinary (non-inreg) arguments and a <16 x float>
; accumulator. Both immediate operands are 0, so no cbsz/abid modifiers are
; expected. The SDAG checks route the accumulator through a[0:15]; the GISEL
; checks first move the A/B operands to v[48:51] and v[30:37], then move the
; accumulator into v[0:15], which is also the destination.
2590 define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
2591 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
2593 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2594 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2595 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2596 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2597 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2598 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
2599 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
2600 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
2601 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
2602 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
2603 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
2604 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
2605 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
2606 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
2607 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
2608 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
2609 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
2610 ; SDAG-NEXT: s_nop 1
2611 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28
2612 ; SDAG-NEXT: s_nop 7
2613 ; SDAG-NEXT: s_nop 2
2614 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2615 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2616 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2617 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2618 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
2619 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
2620 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
2621 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
2622 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
2623 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
2624 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
2625 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
2626 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
2627 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
2628 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
2629 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
2630 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2632 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
2634 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2635 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
2636 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
2637 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
2638 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
2639 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
2640 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
2641 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
2642 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
2643 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
2644 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
2645 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
2646 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
2647 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
2648 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
2649 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
2650 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
2651 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
2652 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
2653 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
2654 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
2655 ; GISEL-NEXT: s_nop 1
2656 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
2657 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2658 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2659 ret <16 x float> %result
; Same shape as the plain non-kernel <16 x float> test, but the two immediate
; operands are 1 and 3; the checks expect cbsz:1 abid:3 on the instruction.
2662 define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
2663 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
2665 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2666 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2667 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2668 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2669 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2670 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
2671 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
2672 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
2673 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
2674 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
2675 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
2676 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
2677 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
2678 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
2679 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
2680 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
2681 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
2682 ; SDAG-NEXT: s_nop 1
2683 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
2684 ; SDAG-NEXT: s_nop 7
2685 ; SDAG-NEXT: s_nop 2
2686 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2687 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2688 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2689 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2690 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
2691 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
2692 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
2693 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
2694 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
2695 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
2696 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
2697 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
2698 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
2699 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
2700 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
2701 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
2702 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2704 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
2706 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2707 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
2708 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
2709 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
2710 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
2711 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
2712 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
2713 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
2714 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
2715 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
2716 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
2717 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
2718 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
2719 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
2720 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
2721 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
2722 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
2723 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
2724 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
2725 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
2726 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
2727 ; GISEL-NEXT: s_nop 1
2728 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
2729 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2730 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
2731 ret <16 x float> %result
; Counterpart of the __flags0 test with the immediate values swapped: the
; operands are 3 and 1, and the checks expect cbsz:3 abid:1 on the instruction.
2734 define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
2735 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
2737 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2738 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2739 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2740 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2741 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2742 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
2743 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
2744 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
2745 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
2746 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
2747 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
2748 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
2749 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
2750 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
2751 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
2752 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
2753 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
2754 ; SDAG-NEXT: s_nop 1
2755 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
2756 ; SDAG-NEXT: s_nop 7
2757 ; SDAG-NEXT: s_nop 2
2758 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2759 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2760 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2761 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2762 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
2763 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
2764 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
2765 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
2766 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
2767 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
2768 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
2769 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
2770 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
2771 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
2772 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
2773 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
2774 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2776 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
2778 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2779 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
2780 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
2781 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
2782 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
2783 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
2784 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
2785 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
2786 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
2787 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
2788 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
2789 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
2790 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
2791 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
2792 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
2793 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
2794 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
2795 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
2796 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
2797 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
2798 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
2799 ; GISEL-NEXT: s_nop 1
2800 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
2801 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2802 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
2803 ret <16 x float> %result
; Callable-function test for smfmac.f32.32x32x64.bf8.bf8 in which every
; argument is marked inreg. Both trailing immediates are 0, so the checked
; instruction carries no cbsz/abid modifier.
; The SDAG checks expect the inputs to be copied into VGPRs and the
; <16 x float> operand to be staged through a[0:15] (v_accvgpr_write before the
; instruction, v_accvgpr_read after it). The GISEL checks expect the result to
; be produced directly in v[0:15].
2806 define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
2807 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
2809 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2810 ; SDAG-NEXT: v_mov_b32_e32 v28, s0
2811 ; SDAG-NEXT: v_mov_b32_e32 v29, s1
2812 ; SDAG-NEXT: v_mov_b32_e32 v30, s2
2813 ; SDAG-NEXT: v_mov_b32_e32 v31, s3
2814 ; SDAG-NEXT: v_mov_b32_e32 v12, s24
2815 ; SDAG-NEXT: v_mov_b32_e32 v27, v9
2816 ; SDAG-NEXT: v_mov_b32_e32 v26, v8
2817 ; SDAG-NEXT: v_mov_b32_e32 v25, v7
2818 ; SDAG-NEXT: v_mov_b32_e32 v24, v6
2819 ; SDAG-NEXT: v_mov_b32_e32 v23, v5
2820 ; SDAG-NEXT: v_mov_b32_e32 v22, v4
2821 ; SDAG-NEXT: v_mov_b32_e32 v21, v3
2822 ; SDAG-NEXT: v_mov_b32_e32 v20, v2
2823 ; SDAG-NEXT: v_mov_b32_e32 v19, v1
2824 ; SDAG-NEXT: v_mov_b32_e32 v18, v0
2825 ; SDAG-NEXT: v_mov_b32_e32 v13, s25
2826 ; SDAG-NEXT: v_mov_b32_e32 v14, s26
2827 ; SDAG-NEXT: v_mov_b32_e32 v15, s27
2828 ; SDAG-NEXT: v_mov_b32_e32 v16, s28
2829 ; SDAG-NEXT: v_mov_b32_e32 v17, s29
2830 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2831 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
2832 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
2833 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
2834 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
2835 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
2836 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
2837 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
2838 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
2839 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2840 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2841 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2842 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
2843 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
2844 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
2845 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
2846 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
2847 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
2848 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
2849 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
2850 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
2851 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
2852 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
2853 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
2854 ; SDAG-NEXT: s_nop 1
2855 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10
2856 ; SDAG-NEXT: s_nop 7
2857 ; SDAG-NEXT: s_nop 2
2858 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2859 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2860 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2861 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2862 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
2863 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
2864 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
2865 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
2866 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
2867 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
2868 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
2869 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
2870 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
2871 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
2872 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
2873 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
2874 ; SDAG-NEXT: s_setpc_b64 s[30:31]
2876 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
2878 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2879 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
2880 ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
2881 ; GISEL-NEXT: v_mov_b32_e32 v18, s24
2882 ; GISEL-NEXT: v_mov_b32_e32 v19, s25
2883 ; GISEL-NEXT: v_mov_b32_e32 v24, v0
2884 ; GISEL-NEXT: v_mov_b32_e32 v25, v1
2885 ; GISEL-NEXT: v_mov_b32_e32 v26, v2
2886 ; GISEL-NEXT: v_mov_b32_e32 v27, v3
2887 ; GISEL-NEXT: v_mov_b32_e32 v28, v4
2888 ; GISEL-NEXT: v_mov_b32_e32 v29, v5
2889 ; GISEL-NEXT: v_mov_b32_e32 v30, v6
2890 ; GISEL-NEXT: v_mov_b32_e32 v31, v7
2891 ; GISEL-NEXT: v_mov_b32_e32 v32, v8
2892 ; GISEL-NEXT: v_mov_b32_e32 v33, v9
2893 ; GISEL-NEXT: v_mov_b32_e32 v16, v10
2894 ; GISEL-NEXT: v_mov_b32_e32 v20, s26
2895 ; GISEL-NEXT: v_mov_b32_e32 v21, s27
2896 ; GISEL-NEXT: v_mov_b32_e32 v22, s28
2897 ; GISEL-NEXT: v_mov_b32_e32 v23, s29
2898 ; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
2899 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
2900 ; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
2901 ; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
2902 ; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
2903 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
2904 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
2905 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
2906 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
2907 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
2908 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
2909 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
2910 ; GISEL-NEXT: s_nop 1
2911 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[34:37], v[48:55], v16
2912 ; GISEL-NEXT: s_setpc_b64 s[30:31]
2913 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2914 ret <16 x float> %result
2917 ; --------------------------------------------------------------------
2918 ; llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8
2919 ; --------------------------------------------------------------------
; Intrinsic under test. The tests in this section pass, in order: a <4 x i32>
; operand, an <8 x i32> operand, a <16 x float> operand whose register range
; the checked instruction also uses as its destination, an i32 operand, and two
; immediates that the checks expect to be printed as the cbsz and abid
; modifiers (omitted when both are 0).
2921 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)
; Kernel test for smfmac.f32.32x32x64.bf8.fp8. %a, %b and %idx come from
; kernel arguments; the <16 x float> input is loaded from %arg indexed by the
; workitem id. The call passes immediates 1 and 2, which the checks expect as
; cbsz:1 abid:2. The result is stored to %arg itself (not to %gep), which is
; why the checked stores use a zero base offset register.
; Both the SDAG and GISEL checks expect the instruction to operate on v[0:15]
; with no AGPR copies.
; NOTE(review): attribute group #0 is defined outside this chunk.
2923 define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
2924 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
2925 ; SDAG: ; %bb.0: ; %bb
2926 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2927 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2928 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
2929 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2930 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
2931 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
2932 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
2933 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
2934 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2935 ; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
2936 ; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2937 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2938 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
2939 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
2940 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
2941 ; SDAG-NEXT: v_mov_b32_e32 v27, s11
2942 ; SDAG-NEXT: v_mov_b32_e32 v16, s12
2943 ; SDAG-NEXT: v_mov_b32_e32 v17, s13
2944 ; SDAG-NEXT: v_mov_b32_e32 v18, s14
2945 ; SDAG-NEXT: v_mov_b32_e32 v19, s15
2946 ; SDAG-NEXT: v_mov_b32_e32 v20, s16
2947 ; SDAG-NEXT: v_mov_b32_e32 v21, s17
2948 ; SDAG-NEXT: v_mov_b32_e32 v22, s18
2949 ; SDAG-NEXT: v_mov_b32_e32 v23, s19
2950 ; SDAG-NEXT: v_mov_b32_e32 v28, s2
2951 ; SDAG-NEXT: s_waitcnt vmcnt(0)
2952 ; SDAG-NEXT: s_nop 0
2953 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
2954 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2955 ; SDAG-NEXT: s_nop 7
2956 ; SDAG-NEXT: s_nop 1
2957 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
2958 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
2959 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
2960 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
2961 ; SDAG-NEXT: s_endpgm
2963 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
2964 ; GISEL: ; %bb.0: ; %bb
2965 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2966 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2967 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
2968 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2969 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
2970 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
2971 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
2972 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
2973 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2974 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2975 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2976 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2977 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
2978 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
2979 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
2980 ; GISEL-NEXT: v_mov_b32_e32 v28, s2
2981 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
2982 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
2983 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
2984 ; GISEL-NEXT: s_waitcnt vmcnt(0)
2985 ; GISEL-NEXT: s_nop 0
2986 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
2987 ; GISEL-NEXT: v_mov_b32_e32 v16, 0
2988 ; GISEL-NEXT: s_nop 7
2989 ; GISEL-NEXT: s_nop 1
2990 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
2991 ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
2992 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
2993 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
2994 ; GISEL-NEXT: s_endpgm
2996 %id = call i32 @llvm.amdgcn.workitem.id.x()
2997 %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
2998 %in.1 = load <16 x float>, ptr addrspace(1) %gep
2999 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
3000 store <16 x float> %mai.1, ptr addrspace(1) %arg
; Callable-function test for smfmac.f32.32x32x64.bf8.fp8 with plain (non-inreg)
; arguments and both immediates 0, so the checked instruction carries no
; cbsz/abid modifier.
; The SDAG checks expect %arg2 (v12..v27) to be written into a[0:15] before the
; instruction and read back into v0..v15 afterwards. The GISEL checks expect
; the inputs to be moved aside so that the result lands directly in v[0:15].
3004 define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
3005 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
3007 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3008 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3009 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3010 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3011 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3012 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3013 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3014 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3015 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3016 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3017 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3018 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3019 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3020 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3021 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3022 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3023 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3024 ; SDAG-NEXT: s_nop 1
3025 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28
3026 ; SDAG-NEXT: s_nop 7
3027 ; SDAG-NEXT: s_nop 2
3028 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3029 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3030 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3031 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3032 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3033 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3034 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3035 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3036 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3037 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3038 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3039 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3040 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3041 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3042 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3043 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3044 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3046 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
3048 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3049 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
3050 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
3051 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
3052 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
3053 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
3054 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
3055 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
3056 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
3057 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
3058 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
3059 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
3060 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
3061 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
3062 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
3063 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
3064 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
3065 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
3066 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
3067 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
3068 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
3069 ; GISEL-NEXT: s_nop 1
3070 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
3071 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3072 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
3073 ret <16 x float> %result
; Same shape as the plain test_smfmac_f32_32x32x64_bf8_fp8 test, but the call
; passes immediates 1 and 3. The checks expect them on the instruction as
; cbsz:1 abid:3; the surrounding register copies are expected to be unchanged
; (AGPR staging for SDAG, direct v[0:15] result for GISEL).
3076 define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
3077 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
3079 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3080 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3081 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3082 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3083 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3084 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3085 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3086 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3087 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3088 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3089 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3090 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3091 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3092 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3093 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3094 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3095 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3096 ; SDAG-NEXT: s_nop 1
3097 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
3098 ; SDAG-NEXT: s_nop 7
3099 ; SDAG-NEXT: s_nop 2
3100 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3101 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3102 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3103 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3104 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3105 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3106 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3107 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3108 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3109 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3110 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3111 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3112 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3113 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3114 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3115 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3116 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3118 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
3120 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3121 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
3122 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
3123 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
3124 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
3125 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
3126 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
3127 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
3128 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
3129 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
3130 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
3131 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
3132 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
3133 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
3134 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
3135 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
3136 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
3137 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
3138 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
3139 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
3140 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
3141 ; GISEL-NEXT: s_nop 1
3142 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
3143 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3144 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
3145 ret <16 x float> %result
; Companion to the __flags0 test with the two immediates swapped: the call
; passes 3 and 1, and the checks expect cbsz:3 abid:1 on the instruction. The
; expected register copies around it match the __flags0 variant.
3148 define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
3149 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
3151 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3152 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3153 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3154 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3155 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3156 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3157 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3158 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3159 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3160 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3161 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3162 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3163 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3164 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3165 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3166 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3167 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3168 ; SDAG-NEXT: s_nop 1
3169 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
3170 ; SDAG-NEXT: s_nop 7
3171 ; SDAG-NEXT: s_nop 2
3172 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3173 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3174 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3175 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3176 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3177 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3178 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3179 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3180 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3181 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3182 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3183 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3184 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3185 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3186 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3187 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3188 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3190 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
3192 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3193 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
3194 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
3195 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
3196 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
3197 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
3198 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
3199 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
3200 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
3201 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
3202 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
3203 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
3204 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
3205 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
3206 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
3207 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
3208 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
3209 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
3210 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
3211 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
3212 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
3213 ; GISEL-NEXT: s_nop 1
3214 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
3215 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3216 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
3217 ret <16 x float> %result
; Callable-function test for smfmac.f32.32x32x64.bf8.fp8 in which every
; argument is marked inreg. Both trailing immediates are 0, so the checked
; instruction carries no cbsz/abid modifier.
; The SDAG checks expect the inputs to be copied into VGPRs and the
; <16 x float> operand to be staged through a[0:15] (v_accvgpr_write before the
; instruction, v_accvgpr_read after it). The GISEL checks expect the result to
; be produced directly in v[0:15].
3220 define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
3221 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
3223 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3224 ; SDAG-NEXT: v_mov_b32_e32 v28, s0
3225 ; SDAG-NEXT: v_mov_b32_e32 v29, s1
3226 ; SDAG-NEXT: v_mov_b32_e32 v30, s2
3227 ; SDAG-NEXT: v_mov_b32_e32 v31, s3
3228 ; SDAG-NEXT: v_mov_b32_e32 v12, s24
3229 ; SDAG-NEXT: v_mov_b32_e32 v27, v9
3230 ; SDAG-NEXT: v_mov_b32_e32 v26, v8
3231 ; SDAG-NEXT: v_mov_b32_e32 v25, v7
3232 ; SDAG-NEXT: v_mov_b32_e32 v24, v6
3233 ; SDAG-NEXT: v_mov_b32_e32 v23, v5
3234 ; SDAG-NEXT: v_mov_b32_e32 v22, v4
3235 ; SDAG-NEXT: v_mov_b32_e32 v21, v3
3236 ; SDAG-NEXT: v_mov_b32_e32 v20, v2
3237 ; SDAG-NEXT: v_mov_b32_e32 v19, v1
3238 ; SDAG-NEXT: v_mov_b32_e32 v18, v0
3239 ; SDAG-NEXT: v_mov_b32_e32 v13, s25
3240 ; SDAG-NEXT: v_mov_b32_e32 v14, s26
3241 ; SDAG-NEXT: v_mov_b32_e32 v15, s27
3242 ; SDAG-NEXT: v_mov_b32_e32 v16, s28
3243 ; SDAG-NEXT: v_mov_b32_e32 v17, s29
3244 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3245 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
3246 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
3247 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
3248 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
3249 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
3250 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
3251 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
3252 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
3253 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3254 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3255 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3256 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3257 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3258 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3259 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3260 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3261 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3262 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3263 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3264 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3265 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3266 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3267 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3268 ; SDAG-NEXT: s_nop 1
3269 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10
3270 ; SDAG-NEXT: s_nop 7
3271 ; SDAG-NEXT: s_nop 2
3272 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3273 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3274 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3275 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3276 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3277 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3278 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3279 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3280 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3281 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3282 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3283 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3284 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3285 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3286 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3287 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3288 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3290 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
3292 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3293 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
3294 ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
3295 ; GISEL-NEXT: v_mov_b32_e32 v18, s24
3296 ; GISEL-NEXT: v_mov_b32_e32 v19, s25
3297 ; GISEL-NEXT: v_mov_b32_e32 v24, v0
3298 ; GISEL-NEXT: v_mov_b32_e32 v25, v1
3299 ; GISEL-NEXT: v_mov_b32_e32 v26, v2
3300 ; GISEL-NEXT: v_mov_b32_e32 v27, v3
3301 ; GISEL-NEXT: v_mov_b32_e32 v28, v4
3302 ; GISEL-NEXT: v_mov_b32_e32 v29, v5
3303 ; GISEL-NEXT: v_mov_b32_e32 v30, v6
3304 ; GISEL-NEXT: v_mov_b32_e32 v31, v7
3305 ; GISEL-NEXT: v_mov_b32_e32 v32, v8
3306 ; GISEL-NEXT: v_mov_b32_e32 v33, v9
3307 ; GISEL-NEXT: v_mov_b32_e32 v16, v10
3308 ; GISEL-NEXT: v_mov_b32_e32 v20, s26
3309 ; GISEL-NEXT: v_mov_b32_e32 v21, s27
3310 ; GISEL-NEXT: v_mov_b32_e32 v22, s28
3311 ; GISEL-NEXT: v_mov_b32_e32 v23, s29
3312 ; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
3313 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
3314 ; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
3315 ; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
3316 ; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
3317 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
3318 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
3319 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
3320 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
3321 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
3322 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
3323 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
3324 ; GISEL-NEXT: s_nop 1
3325 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[34:37], v[48:55], v16
3326 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3327 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
3328 ret <16 x float> %result
3331 ; --------------------------------------------------------------------
3332 ; llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8
3333 ; --------------------------------------------------------------------
; Intrinsic under test. The tests in this section pass, in order: a <4 x i32>
; operand, an <8 x i32> operand, a <16 x float> operand whose register range
; the checked instruction also uses as its destination, an i32 operand, and two
; immediates that the checks expect to be printed as the cbsz and abid
; modifiers (omitted when both are 0).
3335 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)
; Kernel test for smfmac.f32.32x32x64.fp8.bf8. %a, %b and %idx come from
; kernel arguments; the <16 x float> input is loaded from %arg indexed by the
; workitem id. The call passes immediates 1 and 2, which the checks expect as
; cbsz:1 abid:2. The result is stored to %arg itself (not to %gep), which is
; why the checked stores use a zero base offset register.
; Both the SDAG and GISEL checks expect the instruction to operate on v[0:15]
; with no AGPR copies.
; NOTE(review): attribute group #0 is defined outside this chunk.
3337 define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
3338 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
3339 ; SDAG: ; %bb.0: ; %bb
3340 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3341 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
3342 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
3343 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
3344 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
3345 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
3346 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
3347 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
3348 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
3349 ; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
3350 ; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
3351 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
3352 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
3353 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
3354 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
3355 ; SDAG-NEXT: v_mov_b32_e32 v27, s11
3356 ; SDAG-NEXT: v_mov_b32_e32 v16, s12
3357 ; SDAG-NEXT: v_mov_b32_e32 v17, s13
3358 ; SDAG-NEXT: v_mov_b32_e32 v18, s14
3359 ; SDAG-NEXT: v_mov_b32_e32 v19, s15
3360 ; SDAG-NEXT: v_mov_b32_e32 v20, s16
3361 ; SDAG-NEXT: v_mov_b32_e32 v21, s17
3362 ; SDAG-NEXT: v_mov_b32_e32 v22, s18
3363 ; SDAG-NEXT: v_mov_b32_e32 v23, s19
3364 ; SDAG-NEXT: v_mov_b32_e32 v28, s2
3365 ; SDAG-NEXT: s_waitcnt vmcnt(0)
3366 ; SDAG-NEXT: s_nop 0
3367 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
3368 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
3369 ; SDAG-NEXT: s_nop 7
3370 ; SDAG-NEXT: s_nop 1
3371 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3372 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3373 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3374 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3375 ; SDAG-NEXT: s_endpgm
3377 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
3378 ; GISEL: ; %bb.0: ; %bb
3379 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3380 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
3381 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
3382 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
3383 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
3384 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
3385 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
3386 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
3387 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
3388 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
3389 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
3390 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
3391 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
3392 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
3393 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
3394 ; GISEL-NEXT: v_mov_b32_e32 v28, s2
3395 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
3396 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
3397 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
3398 ; GISEL-NEXT: s_waitcnt vmcnt(0)
3399 ; GISEL-NEXT: s_nop 0
3400 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
3401 ; GISEL-NEXT: v_mov_b32_e32 v16, 0
3402 ; GISEL-NEXT: s_nop 7
3403 ; GISEL-NEXT: s_nop 1
3404 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3405 ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3406 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3407 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3408 ; GISEL-NEXT: s_endpgm
3410 %id = call i32 @llvm.amdgcn.workitem.id.x()
3411 %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
3412 %in.1 = load <16 x float>, ptr addrspace(1) %gep
3413 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
3414 store <16 x float> %mai.1, ptr addrspace(1) %arg
; Callable-function test for smfmac.f32.32x32x64.fp8.bf8 with plain (non-inreg)
; arguments and both immediates 0, so the checked instruction carries no
; cbsz/abid modifier.
; The SDAG checks expect %arg2 (v12..v27) to be written into a[0:15] before the
; instruction and read back into v0..v15 afterwards. The GISEL checks expect
; the inputs to be moved aside so that the result lands directly in v[0:15].
3418 define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
3419 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
3421 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3422 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3423 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3424 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3425 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3426 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3427 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3428 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3429 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3430 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3431 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3432 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3433 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3434 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3435 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3436 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3437 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3438 ; SDAG-NEXT: s_nop 1
3439 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28
3440 ; SDAG-NEXT: s_nop 7
3441 ; SDAG-NEXT: s_nop 2
3442 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3443 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3444 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3445 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3446 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3447 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3448 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3449 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3450 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3451 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3452 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3453 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3454 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3455 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3456 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3457 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3458 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3460 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
3462 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3463 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
3464 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
3465 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
3466 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
3467 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
3468 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
3469 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
3470 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
3471 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
3472 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
3473 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
3474 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
3475 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
3476 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
3477 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
3478 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
3479 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
3480 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
3481 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
3482 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
3483 ; GISEL-NEXT: s_nop 1
3484 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
3485 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3486 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
3487 ret <16 x float> %result
; Same shape as the plain test_smfmac_f32_32x32x64_fp8_bf8 test, but the call
; passes immediates 1 and 3. The checks expect them on the instruction as
; cbsz:1 abid:3; the surrounding register copies are expected to be unchanged
; (AGPR staging for SDAG, direct v[0:15] result for GISEL).
3490 define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
3491 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
3493 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3494 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3495 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3496 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3497 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3498 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3499 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3500 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3501 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3502 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3503 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3504 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3505 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3506 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3507 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3508 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3509 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3510 ; SDAG-NEXT: s_nop 1
3511 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
3512 ; SDAG-NEXT: s_nop 7
3513 ; SDAG-NEXT: s_nop 2
3514 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3515 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3516 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3517 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3518 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3519 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3520 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3521 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3522 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3523 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3524 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3525 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3526 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3527 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3528 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3529 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3530 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3532 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
3534 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3535 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
3536 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
3537 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
3538 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
3539 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
3540 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
3541 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
3542 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
3543 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
3544 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
3545 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
3546 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
3547 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
3548 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
3549 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
3550 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
3551 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
3552 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
3553 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
3554 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
3555 ; GISEL-NEXT: s_nop 1
3556 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
3557 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3558 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
3559 ret <16 x float> %result
; Callable-function test for llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8 with the
; two trailing immediates set to 3 and 1 (the reverse of the __flags0 test);
; the expected instruction carries "cbsz:3 abid:1".
; Apart from the modifier values, the expected register shuffling is the same
; as in the other non-inreg tests: SelectionDAG goes through a0-a15,
; GlobalISel keeps the accumulator in v[0:15].
3562 define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
3563 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
3565 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3566 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3567 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3568 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3569 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3570 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3571 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3572 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3573 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3574 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3575 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3576 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3577 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3578 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3579 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3580 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3581 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3582 ; SDAG-NEXT: s_nop 1
3583 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
3584 ; SDAG-NEXT: s_nop 7
3585 ; SDAG-NEXT: s_nop 2
3586 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3587 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3588 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3589 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3590 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3591 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3592 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3593 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3594 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3595 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3596 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3597 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3598 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3599 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3600 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3601 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3602 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3604 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
3606 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3607 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
3608 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
3609 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
3610 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
3611 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
3612 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
3613 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
3614 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
3615 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
3616 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
3617 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
3618 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
3619 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
3620 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
3621 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
3622 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
3623 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
3624 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
3625 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
3626 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
3627 ; GISEL-NEXT: s_nop 1
3628 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
3629 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3630 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
3631 ret <16 x float> %result
; Callable-function test for llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8 where
; every argument is marked inreg and both trailing immediates are 0 (no
; cbsz/abid modifier is printed on the expected instruction).
; In the expected output the A operand comes from s0-s3, the B operand from
; s16-s23 and the first six accumulator elements from s24-s29; each is copied
; into VGPRs with v_mov before the instruction. The remaining ten accumulator
; elements and the index are taken from v0-v9 and v10 respectively.
; NOTE(review): presumably those land in VGPRs because the inreg arguments
; exceed the SGPRs available for argument passing - confirm against the
; calling convention.
3634 define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
3635 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
3637 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3638 ; SDAG-NEXT: v_mov_b32_e32 v28, s0
3639 ; SDAG-NEXT: v_mov_b32_e32 v29, s1
3640 ; SDAG-NEXT: v_mov_b32_e32 v30, s2
3641 ; SDAG-NEXT: v_mov_b32_e32 v31, s3
3642 ; SDAG-NEXT: v_mov_b32_e32 v12, s24
3643 ; SDAG-NEXT: v_mov_b32_e32 v27, v9
3644 ; SDAG-NEXT: v_mov_b32_e32 v26, v8
3645 ; SDAG-NEXT: v_mov_b32_e32 v25, v7
3646 ; SDAG-NEXT: v_mov_b32_e32 v24, v6
3647 ; SDAG-NEXT: v_mov_b32_e32 v23, v5
3648 ; SDAG-NEXT: v_mov_b32_e32 v22, v4
3649 ; SDAG-NEXT: v_mov_b32_e32 v21, v3
3650 ; SDAG-NEXT: v_mov_b32_e32 v20, v2
3651 ; SDAG-NEXT: v_mov_b32_e32 v19, v1
3652 ; SDAG-NEXT: v_mov_b32_e32 v18, v0
3653 ; SDAG-NEXT: v_mov_b32_e32 v13, s25
3654 ; SDAG-NEXT: v_mov_b32_e32 v14, s26
3655 ; SDAG-NEXT: v_mov_b32_e32 v15, s27
3656 ; SDAG-NEXT: v_mov_b32_e32 v16, s28
3657 ; SDAG-NEXT: v_mov_b32_e32 v17, s29
3658 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3659 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
3660 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
3661 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
3662 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
3663 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
3664 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
3665 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
3666 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
3667 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3668 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3669 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3670 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3671 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3672 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3673 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3674 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3675 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3676 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3677 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3678 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3679 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3680 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3681 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3682 ; SDAG-NEXT: s_nop 1
3683 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10
3684 ; SDAG-NEXT: s_nop 7
3685 ; SDAG-NEXT: s_nop 2
3686 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3687 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3688 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3689 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3690 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3691 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3692 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3693 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3694 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3695 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3696 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3697 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3698 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3699 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3700 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3701 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3702 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3704 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
3706 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3707 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
3708 ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
3709 ; GISEL-NEXT: v_mov_b32_e32 v18, s24
3710 ; GISEL-NEXT: v_mov_b32_e32 v19, s25
3711 ; GISEL-NEXT: v_mov_b32_e32 v24, v0
3712 ; GISEL-NEXT: v_mov_b32_e32 v25, v1
3713 ; GISEL-NEXT: v_mov_b32_e32 v26, v2
3714 ; GISEL-NEXT: v_mov_b32_e32 v27, v3
3715 ; GISEL-NEXT: v_mov_b32_e32 v28, v4
3716 ; GISEL-NEXT: v_mov_b32_e32 v29, v5
3717 ; GISEL-NEXT: v_mov_b32_e32 v30, v6
3718 ; GISEL-NEXT: v_mov_b32_e32 v31, v7
3719 ; GISEL-NEXT: v_mov_b32_e32 v32, v8
3720 ; GISEL-NEXT: v_mov_b32_e32 v33, v9
3721 ; GISEL-NEXT: v_mov_b32_e32 v16, v10
3722 ; GISEL-NEXT: v_mov_b32_e32 v20, s26
3723 ; GISEL-NEXT: v_mov_b32_e32 v21, s27
3724 ; GISEL-NEXT: v_mov_b32_e32 v22, s28
3725 ; GISEL-NEXT: v_mov_b32_e32 v23, s29
3726 ; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
3727 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
3728 ; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
3729 ; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
3730 ; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
3731 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
3732 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
3733 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
3734 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
3735 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
3736 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
3737 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
3738 ; GISEL-NEXT: s_nop 1
3739 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[34:37], v[48:55], v16
3740 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3741 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
3742 ret <16 x float> %result
3745 ; --------------------------------------------------------------------
3746 ; llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8
3747 ; --------------------------------------------------------------------
; Intrinsic under test in this section. Operand order as used by the tests
; below: <4 x i32> A, <8 x i32> B, <16 x float> accumulator, i32 index, then
; two immediate operands that appear as the cbsz and abid modifiers (in that
; order) on the expected v_smfmac_f32_32x32x64_fp8_fp8 instruction.
3749 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)
; Kernel-entry test for llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8.
; The IR loads a <16 x float> accumulator from %arg indexed by the workitem
; id, calls the intrinsic with %a, %b, %idx and the immediates 1 and 2, and
; stores the result to %arg (not to the indexed address).
; Both expected sequences load the accumulator with four
; global_load_dwordx4 into v[0:15], copy the kernel arguments from SGPRs into
; VGPRs, issue the instruction with "cbsz:1 abid:2" and write v[0:15] back
; with four global_store_dwordx4. The two selectors differ only in the order
; of the loads/stores and in how the SGPR-to-VGPR copies are grouped.
3751 define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
3752 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
3753 ; SDAG: ; %bb.0: ; %bb
3754 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3755 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
3756 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
3757 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
3758 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
3759 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
3760 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
3761 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
3762 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
3763 ; SDAG-NEXT: s_load_dword s2, s[4:5], 0x64
3764 ; SDAG-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
3765 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
3766 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
3767 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
3768 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
3769 ; SDAG-NEXT: v_mov_b32_e32 v27, s11
3770 ; SDAG-NEXT: v_mov_b32_e32 v16, s12
3771 ; SDAG-NEXT: v_mov_b32_e32 v17, s13
3772 ; SDAG-NEXT: v_mov_b32_e32 v18, s14
3773 ; SDAG-NEXT: v_mov_b32_e32 v19, s15
3774 ; SDAG-NEXT: v_mov_b32_e32 v20, s16
3775 ; SDAG-NEXT: v_mov_b32_e32 v21, s17
3776 ; SDAG-NEXT: v_mov_b32_e32 v22, s18
3777 ; SDAG-NEXT: v_mov_b32_e32 v23, s19
3778 ; SDAG-NEXT: v_mov_b32_e32 v28, s2
3779 ; SDAG-NEXT: s_waitcnt vmcnt(0)
3780 ; SDAG-NEXT: s_nop 0
3781 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
3782 ; SDAG-NEXT: v_mov_b32_e32 v16, 0
3783 ; SDAG-NEXT: s_nop 7
3784 ; SDAG-NEXT: s_nop 1
3785 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3786 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3787 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3788 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3789 ; SDAG-NEXT: s_endpgm
3791 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
3792 ; GISEL: ; %bb.0: ; %bb
3793 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3794 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
3795 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
3796 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
3797 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
3798 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
3799 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
3800 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
3801 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
3802 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
3803 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
3804 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
3805 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
3806 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
3807 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
3808 ; GISEL-NEXT: v_mov_b32_e32 v28, s2
3809 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
3810 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
3811 ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
3812 ; GISEL-NEXT: s_waitcnt vmcnt(0)
3813 ; GISEL-NEXT: s_nop 0
3814 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
3815 ; GISEL-NEXT: v_mov_b32_e32 v16, 0
3816 ; GISEL-NEXT: s_nop 7
3817 ; GISEL-NEXT: s_nop 1
3818 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3819 ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3820 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3821 ; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3822 ; GISEL-NEXT: s_endpgm
3824 %id = call i32 @llvm.amdgcn.workitem.id.x()
3825 %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
3826 %in.1 = load <16 x float>, ptr addrspace(1) %gep
3827 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
3828 store <16 x float> %mai.1, ptr addrspace(1) %arg
; Baseline callable-function test for llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8:
; ordinary (non-inreg) arguments and both trailing immediates set to 0, so the
; expected instruction is printed without cbsz/abid modifiers.
; Expected SelectionDAG output: the accumulator (v12-v27) is copied into
; a0-a15, the instruction writes a[0:15], and the result is read back into
; v0-v15. Expected GlobalISel output: the A/B operands are moved to
; v[48:51]/v[30:37] and the accumulator is kept in v[0:15].
3832 define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
3833 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
3835 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3836 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3837 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3838 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3839 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3840 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3841 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3842 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3843 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3844 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3845 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3846 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3847 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3848 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3849 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3850 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3851 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3852 ; SDAG-NEXT: s_nop 1
3853 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28
3854 ; SDAG-NEXT: s_nop 7
3855 ; SDAG-NEXT: s_nop 2
3856 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3857 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3858 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3859 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3860 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3861 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3862 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3863 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3864 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3865 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3866 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3867 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3868 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3869 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3870 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3871 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3872 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3874 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
3876 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3877 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
3878 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
3879 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
3880 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
3881 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
3882 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
3883 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
3884 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
3885 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
3886 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
3887 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
3888 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
3889 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
3890 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
3891 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
3892 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
3893 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
3894 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
3895 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
3896 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
3897 ; GISEL-NEXT: s_nop 1
3898 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
3899 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3900 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
3901 ret <16 x float> %result
; Callable-function test for llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8 with the
; two trailing immediates set to 1 and 3; the expected instruction carries
; "cbsz:1 abid:3". All operands are ordinary (non-inreg) arguments.
; Expected SelectionDAG output routes the accumulator through a0-a15;
; expected GlobalISel output keeps it in v[0:15] after moving the A/B
; operands to v[48:51]/v[30:37].
3904 define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
3905 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
3907 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3908 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3909 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3910 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3911 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3912 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3913 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3914 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3915 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3916 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3917 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3918 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3919 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3920 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3921 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3922 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3923 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3924 ; SDAG-NEXT: s_nop 1
3925 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
3926 ; SDAG-NEXT: s_nop 7
3927 ; SDAG-NEXT: s_nop 2
3928 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
3929 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
3930 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
3931 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
3932 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
3933 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
3934 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
3935 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
3936 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
3937 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
3938 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
3939 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
3940 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
3941 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
3942 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
3943 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
3944 ; SDAG-NEXT: s_setpc_b64 s[30:31]
3946 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
3948 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3949 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
3950 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
3951 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
3952 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
3953 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
3954 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
3955 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
3956 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
3957 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
3958 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
3959 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
3960 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
3961 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
3962 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
3963 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
3964 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
3965 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
3966 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
3967 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
3968 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
3969 ; GISEL-NEXT: s_nop 1
3970 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
3971 ; GISEL-NEXT: s_setpc_b64 s[30:31]
3972 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
3973 ret <16 x float> %result
; Callable-function test for llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8 with the
; two trailing immediates set to 3 and 1 (the reverse of the __flags0 test);
; the expected instruction carries "cbsz:3 abid:1".
; Apart from the modifier values, the expected register shuffling is the same
; as in the other non-inreg tests: SelectionDAG goes through a0-a15,
; GlobalISel keeps the accumulator in v[0:15].
3976 define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
3977 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
3979 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3980 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
3981 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
3982 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
3983 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
3984 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
3985 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
3986 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
3987 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
3988 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
3989 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
3990 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
3991 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
3992 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
3993 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
3994 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
3995 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
3996 ; SDAG-NEXT: s_nop 1
3997 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
3998 ; SDAG-NEXT: s_nop 7
3999 ; SDAG-NEXT: s_nop 2
4000 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
4001 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
4002 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
4003 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
4004 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
4005 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
4006 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
4007 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
4008 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
4009 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
4010 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
4011 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
4012 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
4013 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
4014 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
4015 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
4016 ; SDAG-NEXT: s_setpc_b64 s[30:31]
4018 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
4020 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4021 ; GISEL-NEXT: v_mov_b32_e32 v48, v0
4022 ; GISEL-NEXT: v_mov_b32_e32 v49, v1
4023 ; GISEL-NEXT: v_mov_b32_e32 v50, v2
4024 ; GISEL-NEXT: v_mov_b32_e32 v51, v3
4025 ; GISEL-NEXT: v_mov_b32_e32 v30, v4
4026 ; GISEL-NEXT: v_mov_b32_e32 v31, v5
4027 ; GISEL-NEXT: v_mov_b32_e32 v32, v6
4028 ; GISEL-NEXT: v_mov_b32_e32 v33, v7
4029 ; GISEL-NEXT: v_mov_b32_e32 v34, v8
4030 ; GISEL-NEXT: v_mov_b32_e32 v35, v9
4031 ; GISEL-NEXT: v_mov_b32_e32 v36, v10
4032 ; GISEL-NEXT: v_mov_b32_e32 v37, v11
4033 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
4034 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
4035 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
4036 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
4037 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
4038 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
4039 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
4040 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
4041 ; GISEL-NEXT: s_nop 1
4042 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
4043 ; GISEL-NEXT: s_setpc_b64 s[30:31]
4044 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
4045 ret <16 x float> %result
; Callable-function test for llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8 where
; every argument is marked inreg and both trailing immediates are 0 (no
; cbsz/abid modifier is printed on the expected instruction).
; In the expected output the A operand comes from s0-s3, the B operand from
; s16-s23 and the first six accumulator elements from s24-s29; each is copied
; into VGPRs with v_mov before the instruction. The remaining ten accumulator
; elements and the index are taken from v0-v9 and v10 respectively.
; NOTE(review): presumably those land in VGPRs because the inreg arguments
; exceed the SGPRs available for argument passing - confirm against the
; calling convention.
4048 define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
4049 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
4051 ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4052 ; SDAG-NEXT: v_mov_b32_e32 v28, s0
4053 ; SDAG-NEXT: v_mov_b32_e32 v29, s1
4054 ; SDAG-NEXT: v_mov_b32_e32 v30, s2
4055 ; SDAG-NEXT: v_mov_b32_e32 v31, s3
4056 ; SDAG-NEXT: v_mov_b32_e32 v12, s24
4057 ; SDAG-NEXT: v_mov_b32_e32 v27, v9
4058 ; SDAG-NEXT: v_mov_b32_e32 v26, v8
4059 ; SDAG-NEXT: v_mov_b32_e32 v25, v7
4060 ; SDAG-NEXT: v_mov_b32_e32 v24, v6
4061 ; SDAG-NEXT: v_mov_b32_e32 v23, v5
4062 ; SDAG-NEXT: v_mov_b32_e32 v22, v4
4063 ; SDAG-NEXT: v_mov_b32_e32 v21, v3
4064 ; SDAG-NEXT: v_mov_b32_e32 v20, v2
4065 ; SDAG-NEXT: v_mov_b32_e32 v19, v1
4066 ; SDAG-NEXT: v_mov_b32_e32 v18, v0
4067 ; SDAG-NEXT: v_mov_b32_e32 v13, s25
4068 ; SDAG-NEXT: v_mov_b32_e32 v14, s26
4069 ; SDAG-NEXT: v_mov_b32_e32 v15, s27
4070 ; SDAG-NEXT: v_mov_b32_e32 v16, s28
4071 ; SDAG-NEXT: v_mov_b32_e32 v17, s29
4072 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
4073 ; SDAG-NEXT: v_mov_b32_e32 v0, s16
4074 ; SDAG-NEXT: v_mov_b32_e32 v1, s17
4075 ; SDAG-NEXT: v_mov_b32_e32 v2, s18
4076 ; SDAG-NEXT: v_mov_b32_e32 v3, s19
4077 ; SDAG-NEXT: v_mov_b32_e32 v4, s20
4078 ; SDAG-NEXT: v_mov_b32_e32 v5, s21
4079 ; SDAG-NEXT: v_mov_b32_e32 v6, s22
4080 ; SDAG-NEXT: v_mov_b32_e32 v7, s23
4081 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
4082 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
4083 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
4084 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
4085 ; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
4086 ; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
4087 ; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
4088 ; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
4089 ; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
4090 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
4091 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
4092 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
4093 ; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
4094 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
4095 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
4096 ; SDAG-NEXT: s_nop 1
4097 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10
4098 ; SDAG-NEXT: s_nop 7
4099 ; SDAG-NEXT: s_nop 2
4100 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
4101 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
4102 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
4103 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
4104 ; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
4105 ; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
4106 ; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
4107 ; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
4108 ; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
4109 ; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
4110 ; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
4111 ; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
4112 ; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
4113 ; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
4114 ; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
4115 ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
4116 ; SDAG-NEXT: s_setpc_b64 s[30:31]
4118 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
4120 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4121 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
4122 ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
4123 ; GISEL-NEXT: v_mov_b32_e32 v18, s24
4124 ; GISEL-NEXT: v_mov_b32_e32 v19, s25
4125 ; GISEL-NEXT: v_mov_b32_e32 v24, v0
4126 ; GISEL-NEXT: v_mov_b32_e32 v25, v1
4127 ; GISEL-NEXT: v_mov_b32_e32 v26, v2
4128 ; GISEL-NEXT: v_mov_b32_e32 v27, v3
4129 ; GISEL-NEXT: v_mov_b32_e32 v28, v4
4130 ; GISEL-NEXT: v_mov_b32_e32 v29, v5
4131 ; GISEL-NEXT: v_mov_b32_e32 v30, v6
4132 ; GISEL-NEXT: v_mov_b32_e32 v31, v7
4133 ; GISEL-NEXT: v_mov_b32_e32 v32, v8
4134 ; GISEL-NEXT: v_mov_b32_e32 v33, v9
4135 ; GISEL-NEXT: v_mov_b32_e32 v16, v10
4136 ; GISEL-NEXT: v_mov_b32_e32 v20, s26
4137 ; GISEL-NEXT: v_mov_b32_e32 v21, s27
4138 ; GISEL-NEXT: v_mov_b32_e32 v22, s28
4139 ; GISEL-NEXT: v_mov_b32_e32 v23, s29
4140 ; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
4141 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
4142 ; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
4143 ; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
4144 ; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
4145 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
4146 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
4147 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
4148 ; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
4149 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
4150 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
4151 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
4152 ; GISEL-NEXT: s_nop 1
4153 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[34:37], v[48:55], v16
4154 ; GISEL-NEXT: s_setpc_b64 s[30:31]
4155 %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
4156 ret <16 x float> %result
; Attribute group #0, referenced by the amdgpu_kernel test functions in this
; file: sets the "amdgpu-flat-work-group-size" string attribute to "1,256".
4159 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }