test/CodeGen/AMDGPU/mfma-loop.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   2
   3 ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
   4
   5 ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
   6 ; 3 vgprs are needed to avoid wait states between writes.
   7
   8 ; FIXME: We should not be using any temporary registers at all.
   9 ; At the moment we initialize an sgpr, then copy it via vgprs.
  10
  11 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]]
  12 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]]
  13
  14 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]]
  15 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
  16 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
  17
  18 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
  19 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
  20 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
  21
  22 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
  23 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
  24 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
  25
  26 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
  27 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
  28 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
  29
  30 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
  31 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
  32 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
  33
  34 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
  35 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
  36 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
  37
  38 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
  39 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
  40 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
  41
  42 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
  43 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
  44 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
  45
  46 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
  47 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
  48 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
  49
  50 ; Check that we do not copy agprs to vgprs and back inside the loop.
  51
  52 ; GCN: [[LOOP:BB[0-9_]+]]:
  53 ; GCN-NOT: v_accvgpr
  54 ; GCN: v_mfma_f32_32x32x1f32
  55 ; GCN-NOT: v_accvgpr
  56 ; GCN: s_cbranch_scc1 [[LOOP]]
  57
  58 ; Final result should be read only once after the loop.
  59 ; The stored result has 32 elements, so exactly 32 reads are expected.
  60 ; GCN-COUNT-32: v_accvgpr_read_b32
  61 ; Kernel under test: 16 MFMA iterations on a zero-initialized accumulator, then one store.
  62 define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
  63 entry:
  64   br label %for.cond.preheader
  65
  66 for.cond.preheader: ; single-block loop: it branches back to itself until the counter hits 16
  67   %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ] ; accumulator: zero on entry, previous MFMA result afterwards
  68   %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] ; iteration counter, starts at 0
  69   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) ; constant float operands; only the accumulator is loop-carried
  70   %inc = add nuw nsw i32 %c, 1
  71   %cc = icmp eq i32 %inc, 16 ; true once 16 iterations have run
  72   br i1 %cc, label %exit, label %for.cond.preheader
  73
  74 exit:
  75   store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg ; the only use of the result outside the loop
  76   ret void
  77 }
  78
  79 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) ; intrinsic called in the loop body of @test_mfma_loop_zeroinit
  80 declare i32 @llvm.amdgcn.workitem.id.x() ; NOTE(review): no use is visible in this part of the file - confirm it is still needed