1 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX940_A %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940,GFX940_A %s
5 ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
7 ; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
9 ; Check that we do not copy agprs to vgprs and back inside the loop.
11 ; GCN: [[LOOP:.LBB[0-9_]+]]:
13 ; GFX908_A: v_mfma_f32_32x32x1f32
14 ; GFX940: v_mfma_f32_32x32x1_2b_f32
16 ; GCN: s_cbranch_scc1 [[LOOP]]
18 ; Final result should be read only once after the loop.
20 ; GFX908-COUNT-32: v_accvgpr_read_b32
21 ; GFX90A-NOT: v_accvgpr_read_b32
22 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
23 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Kernel under test: a <32 x float> value is carried around a single-block loop
; through %phi, seeded with zeroinitializer on entry and replaced by the result
; of one MFMA intrinsic call per iteration. The last result is stored to %arg.
25 define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
27 br label %for.cond.preheader
; Loop-carried vector: all zeros when coming from %entry, the previous MFMA
; result on the back edge.
30 %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
; Iteration counter, starting at 0.
31 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
; Two constant float operands, the loop-carried vector, and three i32 operands
; that are all 0.
32 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
33 %inc = add nuw nsw i32 %c, 1
; Leave the loop once the incremented counter equals 16.
34 %cc = icmp eq i32 %inc, 16
35 br i1 %cc, label %exit, label %for.cond.preheader
; After the loop: write the final MFMA result through the pointer argument.
38 store <32 x float> %mai.1, ptr addrspace(1) %arg
42 ; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
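44 ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
45 ; 3 vgprs are needed to avoid wait states between writes.
46 ; Check that we do not use 32 temp sgprs as well.
; NOTE(review): the directives that follow bind one vgpr as TMP and require all
; 32 v_accvgpr_write_b32 instructions to read that same register, so they pin a
; single temp vgpr rather than a rotation of three. The wording above may be
; stale -- confirm against current output.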
48 ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
49 ; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
51 ; GCN: [[LOOP:.LBB[0-9_]+]]:
53 ; GFX908_A: v_mfma_f32_32x32x1f32
54 ; GFX940: v_mfma_f32_32x32x1_2b_f32
56 ; GCN: s_cbranch_scc1 [[LOOP]]
58 ; GFX908-COUNT-32: v_accvgpr_read_b32
59 ; GFX90A-NOT: v_accvgpr_read_b32
60 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
61 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
63 define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg) #0 {
65 br label %for.cond.preheader
68 %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ]
69 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
70 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
71 %inc = add nuw nsw i32 %c, 1
72 %cc = icmp eq i32 %inc, 16
73 br i1 %cc, label %exit, label %for.cond.preheader
76 store <32 x float> %mai.1, ptr addrspace(1) %arg
80 ; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
82 ; GCN-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
83 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
85 ; GCN: [[LOOP:.LBB[0-9_]+]]:
87 ; GFX908_A: v_mfma_f32_32x32x1f32
88 ; GFX940: v_mfma_f32_32x32x1_2b_f32
90 ; GCN: s_cbranch_scc1 [[LOOP]]
92 ; GFX908-COUNT-32: v_accvgpr_read_b32
93 ; GFX90A-NOT: v_accvgpr_read_b32
94 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
95 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
97 define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
99 br label %for.cond.preheader
102 %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ]
103 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
104 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
105 %inc = add nuw nsw i32 %c, 1
106 %cc = icmp eq i32 %inc, 16
107 br i1 %cc, label %exit, label %for.cond.preheader
110 store <32 x float> %mai.1, ptr addrspace(1) %arg
114 ; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
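116 ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
117 ; 3 vgprs are needed to avoid wait states between writes.
; NOTE(review): the "GFX-908" directives that follow use a prefix that none of
; the FileCheck invocations at the top of this file enable (they pass GFX908,
; without the hyphen), so FileCheck never evaluates them. Their write patterns
; also omit the "a" before the AGPR number and name only v0, unlike the active
; v_accvgpr_write_b32 patterns elsewhere in this file. Regenerate them from
; current gfx908 output before renaming the prefix.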
119 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
120 ; GFX-908: v_mov_b32_e32 v0, 0x42f80000
122 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
123 ; GFX-908: v_mov_b32_e32 v0, 0x42fa0000
125 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
126 ; GFX-908: v_mov_b32_e32 v0, 0x42fc0000
128 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
129 ; GFX-908: v_mov_b32_e32 v0, 0x42fe0000
131 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
132 ; GFX-908: v_mov_b32_e32 v0, 0x43000000
134 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
135 ; GFX-908: v_mov_b32_e32 v0, 0x43010000
137 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
138 ; GFX-908: v_mov_b32_e32 v0, 0x43020000
140 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
141 ; GFX-908: v_mov_b32_e32 v0, 0x43030000
143 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
144 ; GFX-908: v_mov_b32_e32 v0, 0x43040000
146 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
147 ; GFX-908: v_mov_b32_e32 v0, 0x43050000
149 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
150 ; GFX-908: v_mov_b32_e32 v0, 0x43060000
152 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
153 ; GFX-908: v_mov_b32_e32 v0, 0x43070000
155 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
156 ; GFX-908: v_mov_b32_e32 v0, 0x43080000
158 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
159 ; GFX-908: v_mov_b32_e32 v0, 0x43090000
161 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
162 ; GFX-908: v_mov_b32_e32 v0, 0x430a0000
164 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
165 ; GFX-908: v_mov_b32_e32 v0, 0x430b0000
167 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
168 ; GFX-908: v_mov_b32_e32 v0, 0x430c0000
170 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
171 ; GFX-908: v_mov_b32_e32 v0, 0x430d0000
173 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
174 ; GFX-908: v_mov_b32_e32 v0, 0x430e0000
176 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
177 ; GFX-908: v_mov_b32_e32 v0, 0x430f0000
179 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
180 ; GFX-908: v_mov_b32_e32 v0, 0x43100000
182 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
183 ; GFX-908: v_mov_b32_e32 v0, 0x43110000
185 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
186 ; GFX-908: v_mov_b32_e32 v0, 0x43120000
188 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
189 ; GFX-908: v_mov_b32_e32 v0, 0x43130000
191 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
192 ; GFX-908: v_mov_b32_e32 v0, 0x43140000
194 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
195 ; GFX-908: v_mov_b32_e32 v0, 0x43150000
197 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
198 ; GFX-908: v_mov_b32_e32 v0, 0x43160000
200 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
201 ; GFX-908: v_mov_b32_e32 v0, 0x43170000
203 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
204 ; GFX-908: v_mov_b32_e32 v0, 0x43180000
206 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
207 ; GFX-908: v_mov_b32_e32 v0, 0x43190000
209 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
210 ; GFX-908: v_mov_b32_e32 v0, 0x431a0000
212 ; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
214 ; FIXME: Constant is now in VGPR instead of SGPR.
216 ; GFX940_A: v_mov_b32_e32 v{{[0-9]+}}, 0x4{{[0-9a-f]+}}
217 ; GFX940_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
219 ; GCN: [[LOOP:.LBB[0-9_]+]]:
221 ; GFX908_A: v_mfma_f32_32x32x1f32
222 ; GFX940: v_mfma_f32_32x32x1_2b_f32
224 ; GCN: s_cbranch_scc1 [[LOOP]]
226 ; GFX908-COUNT-32: v_accvgpr_read_b32
227 ; GFX90A-NOT: v_accvgpr_read_b32
228 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
229 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Kernel under test: the loop-carried <32 x float> is seeded with a constant
; vector of 32 distinct values, 123.0 through 154.0 in steps of 1.0.
231 define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) #0 {
233 br label %for.cond.preheader
; Loop-carried vector: the 123.0..154.0 sequence when coming from %entry, the
; previous MFMA result on the back edge.
236 %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
; Iteration counter, starting at 0.
237 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
238 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
239 %inc = add nuw nsw i32 %c, 1
; Leave the loop once the incremented counter equals 16.
240 %cc = icmp eq i32 %inc, 16
241 br i1 %cc, label %exit, label %for.cond.preheader
; After the loop: write the final MFMA result through the pointer argument.
244 store <32 x float> %mai.1, ptr addrspace(1) %arg
248 ; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:
250 ; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
252 ; GCN: [[LOOP:.LBB[0-9_]+]]:
254 ; GFX908_A: v_mfma_f32_32x32x1f32
255 ; GFX940: v_mfma_f32_32x32x1_2b_f32
257 ; GCN: s_cbranch_scc1 [[LOOP]]
259 ; GFX908-COUNT-32: v_accvgpr_read_b32
260 ; GFX90A-NOT: v_accvgpr_read_b32
261 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
262 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Kernel under test: the loop-carried <32 x float> is seeded with a splat of a
; value derived from the workitem id, so the initial value is not a constant.
264 define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; Reinterpret the workitem id bits as a float; no numeric conversion happens.
266 %tid = call i32 @llvm.amdgcn.workitem.id.x()
267 %init = bitcast i32 %tid to float
; Build the splat one element at a time: %init goes into indices 0 through 31.
268 %tmp0 = insertelement <32 x float> undef, float %init, i32 0
269 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
270 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
271 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
272 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
273 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
274 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
275 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
276 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
277 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
278 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
279 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
280 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
281 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
282 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
283 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
284 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
285 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
286 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
287 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
288 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
289 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
290 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
291 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
292 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
293 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
294 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
295 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
296 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
297 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
298 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
299 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
301 br label %for.cond.preheader
; Loop-carried vector: the fully populated splat when coming from %entry, the
; previous MFMA result on the back edge.
304 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
; Iteration counter, starting at 0.
305 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
306 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
307 %inc = add nuw nsw i32 %c, 1
; Leave the loop once the incremented counter equals 16.
308 %cc = icmp eq i32 %inc, 16
309 br i1 %cc, label %exit, label %for.cond.preheader
; After the loop: write the final MFMA result through the pointer argument.
312 store <32 x float> %mai.1, ptr addrspace(1) %arg
316 ; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
318 ; GFX908: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
319 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
320 ; GFX940_A-COUNT-32: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}}
322 ; GCN: [[LOOP:.LBB[0-9_]+]]:
324 ; GFX908_A: v_mfma_f32_32x32x1f32
325 ; GFX940: v_mfma_f32_32x32x1_2b_f32
327 ; GCN: s_cbranch_scc1 [[LOOP]]
329 ; GFX908-COUNT-32: v_accvgpr_read_b32
330 ; GFX90A-NOT: v_accvgpr_read_b32
331 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
332 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Kernel under test: the loop-carried <32 x float> is seeded with a splat of
; the float kernel argument %init.
334 define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float %init) #0 {
; Build the splat one element at a time: %init goes into indices 0 through 31.
336 %tmp0 = insertelement <32 x float> undef, float %init, i32 0
337 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
338 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
339 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
340 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
341 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
342 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
343 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
344 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
345 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
346 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
347 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
348 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
349 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
350 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
351 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
352 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
353 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
354 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
355 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
356 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
357 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
358 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
359 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
360 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
361 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
362 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
363 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
364 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
365 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
366 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
367 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
369 br label %for.cond.preheader
; Loop-carried vector: the fully populated splat when coming from %entry, the
; previous MFMA result on the back edge.
372 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
; Iteration counter, starting at 0.
373 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
374 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
375 %inc = add nuw nsw i32 %c, 1
; Leave the loop once the incremented counter equals 16.
376 %cc = icmp eq i32 %inc, 16
377 br i1 %cc, label %exit, label %for.cond.preheader
; After the loop: write the final MFMA result through the pointer argument.
380 store <32 x float> %mai.1, ptr addrspace(1) %arg
384 ; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
386 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
387 ; GFX908-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
388 ; GFX940_A-DAG: s_load_dword [[TMP:s[0-9]+]],
389 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
390 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
391 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
392 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
393 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
394 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
395 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
396 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
397 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
398 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
399 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
400 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
401 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
402 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
403 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
404 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
405 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
406 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
407 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
408 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
409 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
410 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
411 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
412 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
413 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
414 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
415 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
416 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
417 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
418 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
419 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
421 ; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
422 ; GFX90A-COUNT-28: v_accvgpr_write_b32 a{{[0-9]+}}, 0
424 ; GCN: [[LOOP:.LBB[0-9_]+]]:
426 ; GFX908_A: v_mfma_f32_32x32x1f32
427 ; GFX940: v_mfma_f32_32x32x1_2b_f32
429 ; GCN: s_cbranch_scc1 [[LOOP]]
431 ; GFX908-COUNT-32: v_accvgpr_read_b32
432 ; GFX90A-NOT: v_accvgpr_read_b32
433 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
434 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Kernel under test: the loop-carried <32 x float> starts as a zero vector with
; two elements overwritten from different sources: element 0 from the workitem
; id and element 1 from the float kernel argument %x.
436 define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, float %x) #0 {
; Reinterpret the workitem id bits as a float; no numeric conversion happens.
438 %tid = call i32 @llvm.amdgcn.workitem.id.x()
439 %init = bitcast i32 %tid to float
; Elements 2 through 31 keep the 0.0 they get from zeroinitializer.
440 %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
441 %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1
443 br label %for.cond.preheader
; Loop-carried vector: the mixed initial value when coming from %entry, the
; previous MFMA result on the back edge.
446 %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
; Iteration counter, starting at 0.
447 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
448 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
449 %inc = add nuw nsw i32 %c, 1
; Leave the loop once the incremented counter equals 16.
450 %cc = icmp eq i32 %inc, 16
451 br i1 %cc, label %exit, label %for.cond.preheader
; After the loop: write the final MFMA result through the pointer argument.
454 store <32 x float> %mai.1, ptr addrspace(1) %arg
458 ; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
460 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
461 ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
462 ; GFX90A-NOT: v_accvgpr
463 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
464 ; GFX90A-NOT: v_accvgpr
465 ; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
468 ; GCN: [[LOOP:.LBB[0-9_]+]]:
470 ; GFX908_A: v_mfma_f32_32x32x1f32
471 ; GFX940: v_mfma_f32_32x32x1_2b_f32
473 ; GCN: s_cbranch_scc1 [[LOOP]]
475 ; GFX908-COUNT-32: v_accvgpr_read_b32
476 ; GFX90A-NOT: v_accvgpr_read_b32
477 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
478 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Kernel under test: the loop-carried <32 x float> is seeded directly with the
; result of an MFMA intrinsic call made before the loop.
480 define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %arg) #0 {
; Pre-loop MFMA call whose <32 x float> operand is zeroinitializer.
482 %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
484 br label %for.cond.preheader
; Loop-carried vector: the pre-loop MFMA result when coming from %entry, the
; previous in-loop MFMA result on the back edge.
487 %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
; Iteration counter, starting at 0.
488 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
489 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
490 %inc = add nuw nsw i32 %c, 1
; Leave the loop once the incremented counter equals 16.
491 %cc = icmp eq i32 %inc, 16
492 br i1 %cc, label %exit, label %for.cond.preheader
; After the loop: write the final MFMA result through the pointer argument.
495 store <32 x float> %mai.1, ptr addrspace(1) %arg
499 ; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
501 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
502 ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
503 ; GFX90A-NOT: v_accvgpr
504 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
505 ; GFX90A-NOT: v_accvgpr
506 ; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
508 ; Check that we are using only one tmp VGPR.
510 ; GFX908: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
511 ; GFX940_A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
513 ; GCN: [[LOOP:.LBB[0-9_]+]]:
515 ; GFX908_A: v_mfma_f32_32x32x1f32
516 ; GFX940: v_mfma_f32_32x32x1_2b_f32
518 ; GCN: s_cbranch_scc1 [[LOOP]]
520 ; GFX908-COUNT-32: v_accvgpr_read_b32
521 ; GFX90A-NOT: v_accvgpr_read_b32
522 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
523 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Kernel under test: the loop-carried <32 x float> is seeded with a splat of a
; single element taken from the result of a pre-loop MFMA intrinsic call.
525 define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; Pre-loop MFMA call whose <32 x float> operand is zeroinitializer; only its
; element 0 is used below.
527 %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
528 %init = extractelement <32 x float> %mai.0, i32 0
; Build the splat one element at a time: %init goes into indices 0 through 31.
529 %tmp0 = insertelement <32 x float> undef, float %init, i32 0
530 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
531 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
532 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
533 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
534 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
535 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
536 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
537 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
538 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
539 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
540 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
541 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
542 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
543 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
544 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
545 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
546 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
547 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
548 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
549 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
550 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
551 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
552 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
553 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
554 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
555 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
556 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
557 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
558 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
559 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
560 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
562 br label %for.cond.preheader
; Loop-carried vector: the fully populated splat when coming from %entry, the
; previous in-loop MFMA result on the back edge.
565 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
; Iteration counter, starting at 0.
566 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
567 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
568 %inc = add nuw nsw i32 %c, 1
; Leave the loop once the incremented counter equals 16.
569 %cc = icmp eq i32 %inc, 16
570 br i1 %cc, label %exit, label %for.cond.preheader
; After the loop: write the final MFMA result through the pointer argument.
573 store <32 x float> %mai.1, ptr addrspace(1) %arg
577 ; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:
579 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
580 ; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
581 ; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
583 ; Check that we do not copy agprs to vgprs and back in an outer loop.
585 ; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]:
587 ; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
589 ; GFX908_A: v_mfma_f32_32x32x1f32
590 ; GFX940: v_mfma_f32_32x32x1_2b_f32
592 ; GCN: s_cbranch_scc1 [[INNER_LOOP]]
594 ; GCN: s_cbranch_scc1 [[OUTER_LOOP]]
596 ; Final result should be read only once after the loop.
598 ; GFX908-COUNT-32: v_accvgpr_read_b32
599 ; GFX90A-NOT: v_accvgpr_read_b32
600 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
601 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Kernel under test: two nested counted loops. The <32 x float> value starts at
; zeroinitializer, is carried through both loops, and is updated by one MFMA
; intrinsic call per inner iteration. The last result is stored to %arg.
603 define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) #0 {
605 br label %for.cond.preheader
; Outer-loop carried vector: all zeros when coming from %entry, otherwise the
; MFMA result left by the inner loop that just finished.
608 %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
; Outer iteration counter, starting at 0.
609 %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
610 br label %inner.for.cond.preheader
612 inner.for.cond.preheader:
; Inner-loop carried vector: the outer value on first entry, the previous MFMA
; result on the inner back edge.
613 %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
; Inner iteration counter; it restarts at 0 each time the inner loop is entered.
614 %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
615 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
616 %inc = add nuw nsw i32 %c, 1
; Leave the inner loop once its incremented counter equals 16.
617 %cc = icmp eq i32 %inc, 16
618 br i1 %cc, label %inner.exit, label %inner.for.cond.preheader
; Outer-loop latch: advance the outer counter and leave once it equals 16.
621 %inc.0 = add nuw nsw i32 %c.0, 1
622 %cc.0 = icmp eq i32 %inc.0, 16
623 br i1 %cc.0, label %exit, label %for.cond.preheader
; After both loops: write the final MFMA result through the pointer argument.
626 store <32 x float> %mai.1, ptr addrspace(1) %arg
; MFMA intrinsic called by every kernel in this file: two float operands, one
; <32 x float> operand, three i32 operands; returns <32 x float>.
630 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
; Workitem id intrinsic used by the kernels that need a non-constant initial value.
631 declare i32 @llvm.amdgcn.workitem.id.x()
; Attribute group #0, attached to every kernel defined in this file.
633 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }