1 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3 ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
5 ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
6 ; 3 vgprs are needed to avoid wait states between writes.
8 ; FIXME: We should not be using any temporary registers at all.
9 ; At the moment we initialize an sgpr, then copy it via vgprs.
11 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]]
12 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]]
14 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]]
15 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
16 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
18 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
19 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
20 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
22 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
23 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
24 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
26 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
27 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
28 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
30 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
31 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
32 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
34 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
35 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
36 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
38 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
39 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
40 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
42 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
43 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
44 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
46 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
47 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
48 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
50 ; Check that we do not copy agprs to vgprs and back inside the loop.
52 ; GCN: [[LOOP:BB[0-9_]+]]:
54 ; GCN: v_mfma_f32_32x32x1f32
56 ; GCN: s_cbranch_scc1 [[LOOP]]
58 ; Final result should be read only once after the loop.
60 ; GCN-COUNT-32: v_accvgpr_read_b32
; Kernel under test: repeatedly feeds the result of the 32x32x1 f32 MFMA
; intrinsic back into itself as the accumulator operand, starting from an
; all-zero <32 x float>, and stores the final value through %arg
; (a <32 x float> pointer in addrspace(1)).
62 define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
64 br label %for.cond.preheader
; Loop-carried accumulator: zeroinitializer when arriving from %entry,
; the previous iteration's %mai.1 when arriving from the back edge.
67 %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
; Loop-carried iteration counter, starting at 0.
68 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
; Operands: constant floats 1.0 and 2.0, the accumulator %phi, and three
; trailing i32 immediates that are all 0.
69 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
70 %inc = add nuw nsw i32 %c, 1
; Leave the loop once the incremented counter reaches 16, i.e. after the
; MFMA has executed 16 times; otherwise branch back to the loop block.
71 %cc = icmp eq i32 %inc, 16
72 br i1 %cc, label %exit, label %for.cond.preheader
; Write the final accumulated vector to the kernel argument pointer.
75 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
79 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
80 declare i32 @llvm.amdgcn.workitem.id.x()