1 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A %s
4 ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
6 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
7 ; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
8 ; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
10 ; Check that we do not copy agprs to vgprs and back inside the loop.
12 ; GCN: [[LOOP:BB[0-9_]+]]:
14 ; GFX908_A: v_mfma_f32_32x32x1f32
16 ; GCN: s_cbranch_scc1 [[LOOP]]
18 ; Final result should be read only once after the loop.
20 ; GFX908-COUNT-32: v_accvgpr_read_b32
21 ; GFX90A-NOT: v_accvgpr_read_b32
22 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
23 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Kernel with a loop-carried <32 x float> value that starts as zeroinitializer.
; %phi takes zeroinitializer on the edge from %entry and the previous MFMA
; result (%mai.1) on the back edge from %for.cond.preheader.
25 define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
27 br label %for.cond.preheader
30 %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
31 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
32 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
; Counter starts at 0 and the loop is left once the incremented value equals 16.
33 %inc = add nuw nsw i32 %c, 1
34 %cc = icmp eq i32 %inc, 16
35 br i1 %cc, label %exit, label %for.cond.preheader
; The last MFMA result is written through the kernel's pointer argument.
38 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
42 ; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
44 ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
45 ; 3 vgprs are needed to avoid wait states between writes.
46 ; Check that we do not use 32 temp sgprs as well.
48 ; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
49 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
50 ; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
51 ; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
53 ; GCN: [[LOOP:BB[0-9_]+]]:
55 ; GFX908_A: v_mfma_f32_32x32x1f32
57 ; GCN: s_cbranch_scc1 [[LOOP]]
59 ; GFX908-COUNT-32: v_accvgpr_read_b32
60 ; GFX90A-NOT: v_accvgpr_read_b32
61 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
62 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
64 define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) {
66 br label %for.cond.preheader
69 %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ]
70 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
71 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
72 %inc = add nuw nsw i32 %c, 1
73 %cc = icmp eq i32 %inc, 16
74 br i1 %cc, label %exit, label %for.cond.preheader
77 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
81 ; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
83 ; GCN: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}}
84 ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
85 ; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
86 ; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}}
88 ; GCN: [[LOOP:BB[0-9_]+]]:
90 ; GFX908_A: v_mfma_f32_32x32x1f32
92 ; GCN: s_cbranch_scc1 [[LOOP]]
94 ; GFX908-COUNT-32: v_accvgpr_read_b32
95 ; GFX90A-NOT: v_accvgpr_read_b32
96 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
97 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
99 define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) {
101 br label %for.cond.preheader
104 %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ]
105 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
106 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
107 %inc = add nuw nsw i32 %c, 1
108 %cc = icmp eq i32 %inc, 16
109 br i1 %cc, label %exit, label %for.cond.preheader
112 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
116 ; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
118 ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
119 ; 3 vgprs are needed to avoid wait states between writes.
121 ; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
122 ; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
123 ; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
124 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
125 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
126 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
127 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
128 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
129 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
130 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
131 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
132 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
133 ; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
134 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
135 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
136 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
137 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
138 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
139 ; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
140 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
141 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
142 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
143 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
144 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
145 ; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
146 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
147 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
148 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
149 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
150 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
151 ; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
152 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
153 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
154 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
155 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
156 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
157 ; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
158 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
159 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
160 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
161 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
162 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
163 ; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
164 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
165 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
166 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
167 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
168 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
169 ; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
170 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
171 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
172 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
173 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
174 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
175 ; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
176 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
177 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
178 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
179 ; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
180 ; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
181 ; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
182 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
183 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
184 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
186 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
187 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
188 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
189 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
190 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
191 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
192 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
193 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
194 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
195 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
196 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
197 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
198 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
199 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
200 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
201 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
202 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
203 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
204 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
205 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
206 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
207 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
208 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
209 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
210 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
211 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
212 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
213 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
214 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
215 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
216 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
217 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
218 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
219 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
220 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
221 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
222 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
223 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
224 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
225 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
226 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
227 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
228 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
229 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
230 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
231 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
232 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
233 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
234 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
235 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
236 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
237 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
238 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
239 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
240 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
241 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
242 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
243 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
244 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
245 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
246 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
247 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
248 ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
249 ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
252 ; GCN: [[LOOP:BB[0-9_]+]]:
254 ; GFX908_A: v_mfma_f32_32x32x1f32
256 ; GCN: s_cbranch_scc1 [[LOOP]]
258 ; GFX908-COUNT-32: v_accvgpr_read_b32
259 ; GFX90A-NOT: v_accvgpr_read_b32
260 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
261 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Loop-carried vector whose initial value holds 32 distinct constants: the
; ascending sequence 123.0, 124.0, ... up to 154.0.
263 define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) {
265 br label %for.cond.preheader
; Initial value on the %entry edge is the constant sequence below; the back
; edge carries the previous MFMA result.
268 %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
269 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
270 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
; Counter starts at 0 and the loop is left once the incremented value equals 16.
271 %inc = add nuw nsw i32 %c, 1
272 %cc = icmp eq i32 %inc, 16
273 br i1 %cc, label %exit, label %for.cond.preheader
; The last MFMA result is written through the kernel's pointer argument.
276 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
280 ; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:
282 ; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
284 ; GCN: [[LOOP:BB[0-9_]+]]:
286 ; GFX908_A: v_mfma_f32_32x32x1f32
288 ; GCN: s_cbranch_scc1 [[LOOP]]
290 ; GFX908-COUNT-32: v_accvgpr_read_b32
291 ; GFX90A-NOT: v_accvgpr_read_b32
292 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
293 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Loop-carried vector whose initial value is a splat of a per-workitem value:
; the workitem id in x, bitcast to float, is inserted into all 32 elements.
295 define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) {
297 %tid = call i32 @llvm.amdgcn.workitem.id.x()
298 %init = bitcast i32 %tid to float
; Build the splat one element at a time, starting from undef (indices 0..31).
299 %tmp0 = insertelement <32 x float> undef, float %init, i32 0
300 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
301 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
302 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
303 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
304 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
305 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
306 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
307 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
308 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
309 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
310 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
311 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
312 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
313 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
314 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
315 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
316 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
317 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
318 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
319 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
320 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
321 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
322 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
323 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
324 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
325 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
326 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
327 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
328 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
329 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
330 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
332 br label %for.cond.preheader
; %phi takes the fully built splat on the %entry edge and the previous MFMA
; result on the back edge.
335 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
336 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
337 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
; Counter starts at 0 and the loop is left once the incremented value equals 16.
338 %inc = add nuw nsw i32 %c, 1
339 %cc = icmp eq i32 %inc, 16
340 br i1 %cc, label %exit, label %for.cond.preheader
; The last MFMA result is written through the kernel's pointer argument.
343 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
347 ; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
349 ; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
350 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
351 ; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
352 ; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
354 ; GCN: [[LOOP:BB[0-9_]+]]:
356 ; GFX908_A: v_mfma_f32_32x32x1f32
358 ; GCN: s_cbranch_scc1 [[LOOP]]
360 ; GFX908-COUNT-32: v_accvgpr_read_b32
361 ; GFX90A-NOT: v_accvgpr_read_b32
362 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
363 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Loop-carried vector whose initial value is a splat of the float kernel
; argument %init, inserted into all 32 elements.
365 define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) {
; Build the splat one element at a time, starting from undef (indices 0..31).
367 %tmp0 = insertelement <32 x float> undef, float %init, i32 0
368 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
369 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
370 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
371 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
372 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
373 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
374 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
375 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
376 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
377 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
378 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
379 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
380 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
381 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
382 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
383 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
384 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
385 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
386 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
387 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
388 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
389 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
390 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
391 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
392 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
393 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
394 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
395 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
396 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
397 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
398 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
400 br label %for.cond.preheader
; %phi takes the fully built splat on the %entry edge and the previous MFMA
; result on the back edge.
403 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
404 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
405 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
; Counter starts at 0 and the loop is left once the incremented value equals 16.
406 %inc = add nuw nsw i32 %c, 1
407 %cc = icmp eq i32 %inc, 16
408 br i1 %cc, label %exit, label %for.cond.preheader
; The last MFMA result is written through the kernel's pointer argument.
411 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
415 ; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
417 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
418 ; GFX908_A-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
419 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
420 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
421 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
422 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
423 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
424 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
425 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
426 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
427 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
428 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
429 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
430 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
431 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
432 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
433 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
434 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
435 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
436 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
437 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
438 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
439 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
440 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
441 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
442 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
443 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
444 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
445 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
446 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
447 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
448 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
449 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
451 ; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
452 ; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
454 ; GCN: [[LOOP:BB[0-9_]+]]:
456 ; GFX908_A: v_mfma_f32_32x32x1f32
458 ; GCN: s_cbranch_scc1 [[LOOP]]
460 ; GFX908-COUNT-32: v_accvgpr_read_b32
461 ; GFX90A-NOT: v_accvgpr_read_b32
462 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
463 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Loop-carried vector with a mixed initial value: element 0 is the workitem id
; in x bitcast to float, element 1 is the float kernel argument %x, and the
; remaining elements come from zeroinitializer.
465 define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) {
467 %tid = call i32 @llvm.amdgcn.workitem.id.x()
468 %init = bitcast i32 %tid to float
; Only two elements are overwritten; the other 30 stay zero.
469 %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
470 %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1
472 br label %for.cond.preheader
; %phi takes %tmp1 on the %entry edge and the previous MFMA result on the
; back edge.
475 %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
476 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
477 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
; Counter starts at 0 and the loop is left once the incremented value equals 16.
478 %inc = add nuw nsw i32 %c, 1
479 %cc = icmp eq i32 %inc, 16
480 br i1 %cc, label %exit, label %for.cond.preheader
; The last MFMA result is written through the kernel's pointer argument.
483 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
487 ; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
489 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
490 ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
491 ; GFX90A-NOT: v_accvgpr
492 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
493 ; GFX90A-NOT: v_accvgpr
496 ; GCN: [[LOOP:BB[0-9_]+]]:
498 ; GFX908_A: v_mfma_f32_32x32x1f32
500 ; GCN: s_cbranch_scc1 [[LOOP]]
502 ; GFX908-COUNT-32: v_accvgpr_read_b32
503 ; GFX90A-NOT: v_accvgpr_read_b32
504 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
505 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Loop-carried vector whose initial value is itself the result of an MFMA call
; (%mai.0) issued before the loop with a zeroinitializer vector operand.
507 define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
509 %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
511 br label %for.cond.preheader
; %phi takes %mai.0 on the %entry edge and the previous in-loop MFMA result on
; the back edge.
514 %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
515 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
516 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
; Counter starts at 0 and the loop is left once the incremented value equals 16.
517 %inc = add nuw nsw i32 %c, 1
518 %cc = icmp eq i32 %inc, 16
519 br i1 %cc, label %exit, label %for.cond.preheader
; The last MFMA result is written through the kernel's pointer argument.
522 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
526 ; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
528 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
529 ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
530 ; GFX90A-NOT: v_accvgpr
531 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
532 ; GFX90A-NOT: v_accvgpr
534 ; Check that we are using only one tmp VGPR.
536 ; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
537 ; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
538 ; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}}
539 ; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
541 ; GCN: [[LOOP:BB[0-9_]+]]:
543 ; GFX908_A: v_mfma_f32_32x32x1f32
545 ; GCN: s_cbranch_scc1 [[LOOP]]
547 ; GFX908-COUNT-32: v_accvgpr_read_b32
548 ; GFX90A-NOT: v_accvgpr_read_b32
549 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
550 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Loop-carried vector whose initial value is a splat of one element of an MFMA
; result: element 0 of %mai.0 is extracted and inserted into all 32 elements.
552 define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) {
554 %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
555 %init = extractelement <32 x float> %mai.0, i32 0
; Build the splat one element at a time, starting from undef (indices 0..31).
556 %tmp0 = insertelement <32 x float> undef, float %init, i32 0
557 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
558 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
559 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
560 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
561 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
562 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
563 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
564 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
565 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
566 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
567 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
568 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
569 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
570 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
571 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
572 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
573 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
574 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
575 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
576 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
577 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
578 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
579 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
580 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
581 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
582 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
583 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
584 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
585 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
586 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
587 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
589 br label %for.cond.preheader
; %phi takes the fully built splat on the %entry edge and the previous in-loop
; MFMA result on the back edge.
592 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
593 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
594 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
; Counter starts at 0 and the loop is left once the incremented value equals 16.
595 %inc = add nuw nsw i32 %c, 1
596 %cc = icmp eq i32 %inc, 16
597 br i1 %cc, label %exit, label %for.cond.preheader
; The last MFMA result is written through the kernel's pointer argument.
600 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
604 ; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:
606 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
607 ; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
608 ; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
610 ; Check that we do not copy agprs to vgprs and back in an outer loop.
612 ; GCN: [[OUTER_LOOP:BB[0-9_]+]]:
614 ; GCN: [[INNER_LOOP:BB[0-9_]+]]:
616 ; GFX908_A: v_mfma_f32_32x32x1f32
618 ; GCN: s_cbranch_scc1 [[INNER_LOOP]]
620 ; GCN: s_cbranch_scc1 [[OUTER_LOOP]]
622 ; Final result should be read only once after the loop.
624 ; GFX908-COUNT-32: v_accvgpr_read_b32
625 ; GFX90A-NOT: v_accvgpr_read_b32
626 ; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
627 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
; Two nested loops around a single MFMA call. The outer loop's vector phi
; starts as zeroinitializer and is fed by the inner loop's MFMA result; the
; inner loop's vector phi starts from the outer phi and feeds itself.
629 define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
631 br label %for.cond.preheader
; Outer-loop state: %phi.0 is zeroinitializer from %entry, else %mai.1 arriving
; from %inner.exit; %c.0 is the outer counter.
634 %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
635 %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
636 br label %inner.for.cond.preheader
638 inner.for.cond.preheader:
; Inner-loop state: %phi is seeded from %phi.0 and then carries %mai.1; the
; inner counter %c restarts at 0 each time the block is entered from the outer
; loop.
639 %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
640 %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
641 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
; Inner loop is left once its incremented counter equals 16.
642 %inc = add nuw nsw i32 %c, 1
643 %cc = icmp eq i32 %inc, 16
644 br i1 %cc, label %inner.exit, label %inner.for.cond.preheader
; Outer loop is left once its incremented counter equals 16.
647 %inc.0 = add nuw nsw i32 %c.0, 1
648 %cc.0 = icmp eq i32 %inc.0, 16
649 br i1 %cc.0, label %exit, label %for.cond.preheader
; The last MFMA result is written through the kernel's pointer argument.
652 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
; Declarations of the two intrinsics called by the kernels in this file: the
; MFMA operation (two float operands, a <32 x float> vector operand and three
; i32 operands, returning <32 x float>) and the workitem id query for x.
656 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
657 declare i32 @llvm.amdgcn.workitem.id.x()