llvm/test/CodeGen/AMDGPU/acc-ldst.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
   2
   3 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
   4 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
   5 declare i32 @llvm.amdgcn.workitem.id.x()
   6
   7 ; GCN-LABEL:  {{^}}test_load_mfma_store16:
   8 ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
   9 ; GCN-NOT:     v_accvgpr_write
  10 ; GCN:         v_mfma_f32_32x32x1f32
  11 ; GCN-NEXT:    s_nop 7
  12 ; GCN-NEXT:    s_nop 7
  13 ; GCN-NEXT:    s_nop 2
  14 ; GCN-NOT:     v_accvgpr_read
  15 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
     ; Load -> MFMA -> store of a whole <32 x float>. The loaded vector is the
     ; vector operand of the MFMA and the MFMA result is stored back to the same
     ; address. The checks above expect eight dwordx4 loads that write
     ; a-registers directly, eight dwordx4 stores that read a-registers directly,
     ; no v_accvgpr_write before the MFMA and no v_accvgpr_read after it.
  16 define amdgpu_kernel void @test_load_mfma_store16(<32 x float> addrspace(1)* %arg) #0 {
  17 bb:
  18   %tid = call i32 @llvm.amdgcn.workitem.id.x()
       ; Per-workitem element of %arg; used for both the load and the store.
  19   %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  20   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  21   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
  22   store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
  23   ret void
  24 }
  25
  26 ; GCN-LABEL: {{^}}test_load1_mfma_store1:
  27 ; GCN:      global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
  28 ; GCN-NOT:  v_accvgpr_read
  29 ; GCN:      v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]:
  30 ; GCN-NEXT: s_nop 7
  31 ; GCN-NEXT: s_nop 7
  32 ; GCN-NEXT: s_nop 2
  33 ; GCN-NOT:  v_accvgpr_read
  34 ; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
     ; Scalar variant: only element 0 of the MFMA vector operand comes from
     ; memory (inserted into a zero vector) and only element 0 of the result is
     ; stored. The checks above expect a single dword load into an a-register,
     ; no v_accvgpr_read anywhere, and a dword store that reads the first
     ; register of the MFMA destination tuple (captured as N).
  35 define amdgpu_kernel void @test_load1_mfma_store1(float addrspace(1)* %arg) #0 {
  36 bb:
  37   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  38   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
  39   %in.1 = load float, float addrspace(1)* %gep
       ; Elements 1..31 of the MFMA input stay zero.
  40   %init = insertelement <32 x float> zeroinitializer, float %in.1, i32 0
  41   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %init, i32 1, i32 2, i32 3)
  42   %elt = extractelement <32 x float> %mai.1, i32 0
  43   store float %elt, float addrspace(1)* %gep
  44   ret void
  45 }
  46
  47 ; GCN-LABEL: {{^}}test_load4_mfma_store4:
  48 ; GCN:      global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
  49 ; GCN-NOT:  v_accvgpr_write
  50 ; GCN:      v_mfma_i32_4x4x4i8 [[A:a\[[0-9:]+\]]]
  51 ; GCN-NEXT: s_nop 4
  52 ; GCN-NOT:  v_accvgpr_read
  53 ; GCN-NEXT: global_store_dwordx4 v{{[0-9:]+}}, [[A]], s[{{[0-9:]+}}]
     ; <4 x i32> variant using the i32 4x4x4i8 MFMA. The checks above expect one
     ; dwordx4 load into an a-register tuple, no v_accvgpr_write / v_accvgpr_read
     ; copies, and a dwordx4 store that reads exactly the tuple the MFMA wrote
     ; (captured as A).
  54 define amdgpu_kernel void @test_load4_mfma_store4(<4 x i32> addrspace(1)* %arg) #0 {
  55 bb:
  56   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  57   %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
  58   %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %gep
  59   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
  60   store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
  61   ret void
  62 }
  63
  64 ; GCN-LABEL: {{^}}test_load_store:
  65 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
  66 ; GCN-NOT:     v_accvgpr
  67 ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
     ; Plain copy with no MFMA involved: the vector loaded from %gep.1 is stored
     ; unchanged to %gep.2. The checks above expect the loads and stores to use
     ; v-registers only and no v_accvgpr instruction between them.
  68 define amdgpu_kernel void @test_load_store(<32 x float> addrspace(1)* %arg) #0 {
  69 bb:
  70   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  71   %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
       ; Destination is 32 <32 x float> elements past the source.
  72   %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
  73   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
  74   store <32 x float> %in.1, <32 x float> addrspace(1)* %gep.2
  75   ret void
  76 }
  77
  78 ; GCN-LABEL: {{^}}test_load_add_mfma_store:
  79 ; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
  80 ; GCN-COUNT-32: v_accvgpr_write
  81 ; GCN:          v_mfma_f32_32x32x1f32
  82 ; GCN-NEXT:     s_nop 7
  83 ; GCN-NEXT:     s_nop 7
  84 ; GCN-NEXT:     s_nop 2
  85 ; GCN-NOT:      v_accvgpr_read
  86 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
     ; An fadd sits between the load and the MFMA, so the loaded value is not
     ; fed to the MFMA directly. The checks above expect the loads to target
     ; v-registers, 32 v_accvgpr_write copies before the MFMA, no v_accvgpr_read
     ; after it, and stores that read a-registers directly.
  87 define amdgpu_kernel void @test_load_add_mfma_store(<32 x float> addrspace(1)* %arg) #0 {
  88 bb:
  89   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  90   %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  91   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  92   %add.1 = fadd <32 x float> %in.1, %in.1
  93   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
  94   store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
  95   ret void
  96 }
  97
  98 ; GCN-LABEL: {{^}}test_load_add_store:
  99 ; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 100 ; GCN-NOT:      v_accvgpr
 101 ; GCN-COUNT-16: v_pk_add_f32
 102 ; GCN-NOT:      v_accvgpr
 103 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
     ; Load -> fadd -> store with no MFMA at all. The checks above expect
     ; v-register loads, sixteen v_pk_add_f32, v-register stores, and no
     ; v_accvgpr instruction on either side of the adds.
 104 define amdgpu_kernel void @test_load_add_store(<32 x float> addrspace(1)* %arg) #0 {
 105 bb:
 106   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 107   %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
 108   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
 109   %add.1 = fadd <32 x float> %in.1, %in.1
 110   store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
 111   ret void
 112 }
 113
 114 ; GCN-LABEL: {{^}}test_load_mfma_add_store:
 115 ; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 116 ; GCN-COUNT-32: v_accvgpr_write
 117 ; GCN:          v_mfma_f32_32x32x1f32
 118 ; GCN-COUNT-32: v_accvgpr_read
 119 ; GCN:          v_pk_add_f32
 120 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
     ; The loaded vector feeds both the MFMA and a later fadd, and the stored
     ; value is the fadd result rather than the MFMA result. The checks above
     ; expect v-register loads, 32 v_accvgpr_write before the MFMA, 32
     ; v_accvgpr_read after it, a v_pk_add_f32, and v-register stores.
 121 define amdgpu_kernel void @test_load_mfma_add_store(<32 x float> addrspace(1)* %arg) #0 {
 122 bb:
 123   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 124   %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
 125   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
 126   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
       ; Second use of %in.1, combined with the MFMA result.
 127   %add.1 = fadd <32 x float> %mai.1, %in.1
 128   store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
 129   ret void
 130 }
 131
 132 ; GCN-LABEL: {{^}}test_load_add_mfma_mul_store:
 133 ; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 134 ; GCN:          v_pk_add_f32
 135 ; GCN-COUNT-32: v_accvgpr_write
 136 ; GCN:          v_mfma_f32_32x32x1f32
 137 ; GCN-COUNT-32: v_accvgpr_read
 138 ; GCN:          v_pk_mul_f32
 139 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
     ; The MFMA is sandwiched between vector ALU ops: fadd produces its input and
     ; fmul consumes its output. The checks above expect v-register loads, a
     ; v_pk_add_f32, 32 v_accvgpr_write, the MFMA, 32 v_accvgpr_read, a
     ; v_pk_mul_f32, and v-register stores, in that order.
 140 define amdgpu_kernel void @test_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) #0 {
 141 bb:
 142   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 143   %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
 144   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
 145   %add.1 = fadd <32 x float> %in.1, %in.1
 146   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
       ; Squares the MFMA result; the loaded value is not reused here.
 147   %mul.1 = fmul <32 x float> %mai.1, %mai.1
 148   store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
 149   ret void
 150 }
 151
 152 ; GCN-LABEL: {{^}}test_mixeduse_load_add_mfma_mul_store:
 153 ; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 154 ; GCN-COUNT-32: v_accvgpr_write
 155 ; GCN:          v_mfma_f32_32x32x1f32
 156 ; GCN-COUNT-32: v_accvgpr_read
 157 ; GCN:          v_pk_mul_f32
 158 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
     ; Same fadd -> MFMA -> fmul chain as test_load_add_mfma_mul_store, except
     ; that the fmul multiplies the MFMA result by the originally loaded vector,
     ; so %in.1 is used both before and after the MFMA. The checks above expect
     ; v-register loads, 32 v_accvgpr_write, the MFMA, 32 v_accvgpr_read, a
     ; v_pk_mul_f32, and v-register stores.
 159 define amdgpu_kernel void @test_mixeduse_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) #0 {
 160 bb:
 161   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 162   %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
 163   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
 164   %add.1 = fadd <32 x float> %in.1, %in.1
 165   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
 166   %mul.1 = fmul <32 x float> %mai.1, %in.1
 167   store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
 168   ret void
 169 }
 170
 171 ; GCN-LABEL: {{^}}test_multiuse_load_mfma_mfma_store:
 172 ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 173 ; GCN-NOT:     v_accvgpr_write
 174 ; GCN:         v_mfma_f32_32x32x1f32
 175 ; GCN-NOT:     v_accvgpr_read
 176 ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
     ; One loaded vector feeds two MFMA calls, which differ only in their
     ; trailing immediate operands; both results are stored. The checks above
     ; expect the loads to write a-registers directly with no v_accvgpr_write
     ; before the first matched MFMA, and no v_accvgpr_read between that MFMA and
     ; the eighth matched store from a-registers. Only one MFMA and one group of
     ; eight stores are matched explicitly.
 177 define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(<32 x float> addrspace(1)* %arg) #0 {
 178 bb:
 179   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 180   %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
 181   %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
 182   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
 183   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
 184   %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
 185   store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep.1
 186   store <32 x float> %mai.2, <32 x float> addrspace(1)* %gep.2
 187   ret void
 188 }
 189
 190 ; NB: for atomics both vdata and vdst shall be either VGPR or AGPR
 191 ; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic_store:
 192 ; GCN:     global_atomic_sub [[IN:v[0-9]+]], v{{[0-9:]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}] glc
 193 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[IN]]
 194 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 195 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 196 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 197 ; GCN:     v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:
 198 ; GCN:     v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}}
 199 ; GCN:     global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc
 200 ; GCN:     global_store_dword v{{[0-9]+}}, v{{[0-9]+}},
     ; 32-bit atomics on both sides of an MFMA. The value returned by the atomic
     ; sub becomes element 0 of the MFMA vector operand (the other elements are
     ; zero), and element 0 of the MFMA result is the data operand of the atomic
     ; add. The checks above expect both atomics to use v-registers for returned
     ; value and data, so explicit copies are required: four v_accvgpr_write_b32
     ; before the MFMA (one sourced from the atomic sub result IN) and a
     ; v_accvgpr_read_b32 of the first MFMA destination register (N) into the
     ; register V consumed by the atomic add.
 201 define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(i32 addrspace(1)* %arg) #0 {
 202 bb:
 203   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 204   %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tid
 205   %in.1 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst
       ; Build <%in.1, 0, 0, 0> as the MFMA vector operand.
 206   %tmp0 = insertelement <4 x i32> undef, i32 %in.1, i32 0
 207   %tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1
 208   %tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2
 209   %tmp3 = insertelement <4 x i32> %tmp2, i32 0, i32 3
 210   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp3, i32 0, i32 0, i32 0)
 211   %elt = extractelement <4 x i32> %mai.1, i32 0
 212   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %elt seq_cst
       ; The old value returned by the atomic add is written to the base of %arg.
 213   store i32 %val, i32 addrspace(1)* %arg
 214   ret void
 215 }
 216
 217 ; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store:
 218 ; GCN:         global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
 219 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 220 ; GCN:         v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:
 221 ; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
 222 ; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
 223 ; GCN:         global_atomic_add_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
     ; 64-bit counterpart of test_atomic_mfma_4xi32_atomic_store. The i64
     ; returned by the atomic sub is reinterpreted as part of the <4 x i32> MFMA
     ; operand, and elements 0 and 1 of the MFMA result are packed into the i64
     ; data operand of the atomic add. The checks above expect v-register pairs
     ; on both x2 atomics, four v_accvgpr_write_b32 before the MFMA and two
     ; v_accvgpr_read_b32 after it. The captured variable N is not referenced by
     ; any later check in this function.
 224 define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(i64 addrspace(1)* %arg) #0 {
 225 bb:
 226   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 227   %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
 228   %in.1 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 1 seq_cst
 229   %tmp0 = insertelement <2 x i64> undef, i64 %in.1, i32 0
 230   %tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1
       ; NOTE(review): the bitcast reads %tmp0, so %tmp1 (which sets element 1 to
       ; zero) is unused and the upper i64 of the MFMA operand stays undef.
       ; Confirm whether %tmp1 was the intended source before changing it; the
       ; check lines above were presumably written against the current input.
 231   %tmp2 = bitcast <2 x i64> %tmp0 to <4 x i32>
 232   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp2, i32 0, i32 0, i32 0)
       ; Pack result elements 0 and 1 into a single i64 for the atomic add.
 233   %elt.1 = extractelement <4 x i32> %mai.1, i32 0
 234   %elt.2 = extractelement <4 x i32> %mai.1, i32 1
 235   %v2.1 = insertelement <2 x i32> undef, i32 %elt.1, i32 0
 236   %v2.2 = insertelement <2 x i32> %v2.1, i32 %elt.2, i32 1
 237   %v2 = bitcast <2 x i32> %v2.2 to i64
 238   %val = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %v2 seq_cst
 239   store i64 %val, i64 addrspace(1)* %arg
 240   ret void
 241 }
 242
 243 ; NB: both data operands should be VGPR or AGPR
 244 ; GCN-LABEL: {{^}}test_load_mfma_ds2_store:
 245 ; GCN-DAG: ds_read_b128 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}
 246 ; GCN-NOT: v_accvgpr_write
 247 ; GCN-DAG: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:{{[0-9]+}}], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
 248 ; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
 249 ; GCN-NOT: v_accvgpr_read
 250 ; GCN:     ds_write_b32 v{{[0-9]+}}, a[[N]] offset:128
     ; addrspace(3) variant with two nearby i32 stores: a constant 1 to the base
     ; of %arg and element 0 of the MFMA result 32 i32 elements further on
     ; (offset:128 in the check). The checks above expect a ds_read_b128 into an
     ; a-register tuple IN that the MFMA consumes directly, one ds_write_b32 with
     ; v-register data, a separate ds_write_b32 whose data is the first MFMA
     ; destination register (N), and no v_accvgpr_write / v_accvgpr_read copies.
 251 define amdgpu_kernel void @test_load_mfma_ds2_store(<4 x i32> addrspace(3)* %arg) #0 {
 252 bb:
 253   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 254   %gep.1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(3)* %arg, i32 %tid
 255   %in.1 = load <4 x i32>, <4 x i32> addrspace(3)* %gep.1
 256   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
 257   %elt = extractelement <4 x i32> %mai.1, i32 0
       ; Both stores are addressed from the base of %arg, not from %gep.1.
 258   %ptr = bitcast <4 x i32> addrspace(3)* %arg to i32 addrspace(3)*
 259   %gep.2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 32
 260   store i32 1, i32 addrspace(3)* %ptr
 261   store i32 %elt, i32 addrspace(3)* %gep.2
 262   ret void
 263 }
 264
 265 ; GCN-LABEL: {{^}}test_mfma_loop_4xi32:
 266 ; GCN:     global_load_dwordx4 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 267 ; GCN-NOT: v_accvgpr_write
 268 ; GCN:     v_mfma_i32_4x4x4i8 [[RES:a\[[0-9:]+\]]], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
 269 ; GCN-NOT: v_accvgpr_read
 270 ; GCN:     global_store_dwordx4 v[{{[0-9:]+}}], [[RES]],
     ; MFMA inside a loop: the loaded <4 x i32> seeds a phi that carries the
     ; MFMA result from one iteration to the next, and the final result is
     ; stored after the loop. The checks above expect the load to write an
     ; a-register tuple IN that the MFMA reads as its last operand, the store to
     ; read the MFMA destination tuple RES, and no v_accvgpr_write /
     ; v_accvgpr_read copies.
 271 define amdgpu_kernel void @test_mfma_loop_4xi32(<4 x i32> addrspace(1)* %arg) #0 {
 272 entry:
 273   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 274   %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
 275   %in = load <4 x i32>, <4 x i32> addrspace(1)* %gep
 276   br label %for.cond.preheader
 277
 278 for.cond.preheader:
       ; %phi is the loaded value on entry and the previous MFMA result afterwards.
 279   %phi = phi <4 x i32> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
 280   %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
 281   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %phi, i32 0, i32 0, i32 0)
       ; Counter starts at 0 and the loop exits once the incremented value is 16.
 282   %inc = add nuw nsw i32 %c, 1
 283   %cc = icmp eq i32 %inc, 16
 284   br i1 %cc, label %exit, label %for.cond.preheader
 285
 286 exit:
 287   store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
 288   ret void
 289 }
 290
 291 ; GCN-LABEL: {{^}}test_mfma_loop_32xfloat:
 292 ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 293 ; GCN-NOT:     v_accvgpr_write
 294 ; GCN:         v_mfma_f32_32x32x1f32
 295 ; GCN-NOT:     v_accvgpr_read
 296 ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}],
 297 ; GCN:         s_endpgm
     ; <32 x float> version of the loop test: the loaded vector seeds a phi that
     ; carries the MFMA result around the loop, and the final result is stored
     ; after the loop. The checks above expect eight dwordx4 loads into
     ; a-registers, eight dwordx4 stores from a-registers, no v_accvgpr_write /
     ; v_accvgpr_read copies, and s_endpgm after the stores.
 298 define amdgpu_kernel void @test_mfma_loop_32xfloat(<32 x float> addrspace(1)* %arg) #0 {
 299 entry:
 300   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 301   %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
 302   %in = load <32 x float>, <32 x float> addrspace(1)* %gep
 303   br label %for.cond.preheader
 304
 305 for.cond.preheader:
       ; %phi is the loaded value on entry and the previous MFMA result afterwards.
 306   %phi = phi <32 x float> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
 307   %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
 308   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
       ; Counter starts at 0 and the loop exits once the incremented value is 16.
 309   %inc = add nuw nsw i32 %c, 1
 310   %cc = icmp eq i32 %inc, 16
 311   br i1 %cc, label %exit, label %for.cond.preheader
 312
 313 exit:
 314   store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
 315   ret void
 316 }
 317
 318 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }