1 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
; Intrinsics called by the kernels in this file: two MFMA variants
; (f32 32x32x1 f32 and i32 4x4x4 i8) and the workitem-id query that every
; kernel uses to index its pointer argument.
3 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
4 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
5 declare i32 @llvm.amdgcn.workitem.id.x()
; Kernel: load a <32 x float> from %arg indexed by the workitem id, pass it as
; the vector operand of the f32 32x32x1 MFMA intrinsic, and store the result
; back to the same address.
; Expected codegen per the checks below: the eight dwordx4 loads write
; a-registers directly, no v_accvgpr_write appears before the MFMA, no
; v_accvgpr_read appears before the stores, and the eight dwordx4 stores take
; their data from a-registers.
7 ; GCN-LABEL: {{^}}test_load_mfma_store16:
8 ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
9 ; GCN-NOT: v_accvgpr_write
10 ; GCN: v_mfma_f32_32x32x1f32
14 ; GCN-NOT: v_accvgpr_read
15 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
16 define amdgpu_kernel void @test_load_mfma_store16(<32 x float> addrspace(1)* %arg) #0 {
18 %tid = call i32 @llvm.amdgcn.workitem.id.x()
19 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
20 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
; The loaded vector's only use is the MFMA; the MFMA result's only use is the store.
21 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
22 store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
; Kernel: load a single float, insert it as element 0 of an otherwise zero
; <32 x float>, run the f32 32x32x1 MFMA on that vector, extract element 0 of
; the result and store it back to the same address.
; Expected codegen per the checks below: the dword load writes an a-register,
; no v_accvgpr_read appears before the MFMA, the first register of the MFMA
; destination is captured as N, and the dword store that follows takes a[N]
; as its data with no v_accvgpr_read in between.
26 ; GCN-LABEL: {{^}}test_load1_mfma_store1:
27 ; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
28 ; GCN-NOT: v_accvgpr_read
29 ; GCN: v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]:
33 ; GCN-NOT: v_accvgpr_read
34 ; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
35 define amdgpu_kernel void @test_load1_mfma_store1(float addrspace(1)* %arg) #0 {
37 %tid = call i32 @llvm.amdgcn.workitem.id.x()
38 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
39 %in.1 = load float, float addrspace(1)* %gep
; Only element 0 comes from memory; the remaining 31 elements are constant zero.
40 %init = insertelement <32 x float> zeroinitializer, float %in.1, i32 0
41 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %init, i32 1, i32 2, i32 3)
; Only element 0 of the MFMA result is consumed.
42 %elt = extractelement <32 x float> %mai.1, i32 0
43 store float %elt, float addrspace(1)* %gep
; Kernel: load a <4 x i32> from %arg indexed by the workitem id, pass it as the
; vector operand of the i32 4x4x4 i8 MFMA intrinsic, and store the result back
; to the same address.
; Expected codegen per the checks below: the dwordx4 load writes an a-register
; tuple, no v_accvgpr_write appears before the MFMA, the MFMA destination tuple
; is captured as A, and the dwordx4 store uses that same tuple as data with no
; v_accvgpr_read in between.
47 ; GCN-LABEL: {{^}}test_load4_mfma_store4:
48 ; GCN: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
49 ; GCN-NOT: v_accvgpr_write
50 ; GCN: v_mfma_i32_4x4x4i8 [[A:a\[[0-9:]+\]]]
52 ; GCN-NOT: v_accvgpr_read
53 ; GCN-NEXT: global_store_dwordx4 v{{[0-9:]+}}, [[A]], s[{{[0-9:]+}}]
54 define amdgpu_kernel void @test_load4_mfma_store4(<4 x i32> addrspace(1)* %arg) #0 {
56 %tid = call i32 @llvm.amdgcn.workitem.id.x()
57 %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
58 %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %gep
59 %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
60 store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
; Kernel: plain copy with no MFMA. Loads a <32 x float> from %gep.1 and stores
; it 32 vector elements further on, at %gep.2.
; Expected codegen per the checks below: eight dwordx4 loads into v-registers
; and eight dwordx4 stores whose data comes from v-registers.
64 ; GCN-LABEL: {{^}}test_load_store:
65 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
67 ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
68 define amdgpu_kernel void @test_load_store(<32 x float> addrspace(1)* %arg) #0 {
70 %tid = call i32 @llvm.amdgcn.workitem.id.x()
71 %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
; Destination address: 32 <32 x float> elements past the source address.
72 %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
73 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
74 store <32 x float> %in.1, <32 x float> addrspace(1)* %gep.2
; Kernel: load a <32 x float>, add it to itself, pass the sum as the vector
; operand of the f32 32x32x1 MFMA, and store the MFMA result back.
; Expected codegen per the checks below: because the loaded value is consumed
; by the fadd, the eight dwordx4 loads target v-registers and 32
; v_accvgpr_write instructions precede the MFMA; after the MFMA no
; v_accvgpr_read appears and the eight dwordx4 stores take their data from
; a-registers.
78 ; GCN-LABEL: {{^}}test_load_add_mfma_store:
79 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
80 ; GCN-COUNT-32: v_accvgpr_write
81 ; GCN: v_mfma_f32_32x32x1f32
85 ; GCN-NOT: v_accvgpr_read
86 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
87 define amdgpu_kernel void @test_load_add_mfma_store(<32 x float> addrspace(1)* %arg) #0 {
89 %tid = call i32 @llvm.amdgcn.workitem.id.x()
90 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
91 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
; Vector arithmetic between the load and the MFMA.
92 %add.1 = fadd <32 x float> %in.1, %in.1
93 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
94 store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
; Kernel: no MFMA at all. Loads a <32 x float>, adds it to itself and stores
; the sum back to the same address.
; Expected codegen per the checks below: eight dwordx4 loads into v-registers,
; sixteen v_pk_add_f32 instructions, and eight dwordx4 stores whose data comes
; from v-registers.
98 ; GCN-LABEL: {{^}}test_load_add_store:
99 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
101 ; GCN-COUNT-16: v_pk_add_f32
103 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
104 define amdgpu_kernel void @test_load_add_store(<32 x float> addrspace(1)* %arg) #0 {
106 %tid = call i32 @llvm.amdgcn.workitem.id.x()
107 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
108 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
109 %add.1 = fadd <32 x float> %in.1, %in.1
110 store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
; Kernel: load a <32 x float>, pass it as the vector operand of the f32
; 32x32x1 MFMA, add the loaded vector to the MFMA result, and store the sum.
; The loaded value therefore has two users: the MFMA and the fadd.
; Expected codegen per the checks below: eight dwordx4 loads into v-registers,
; 32 v_accvgpr_write before the MFMA, 32 v_accvgpr_read after it, and eight
; dwordx4 stores whose data comes from v-registers.
114 ; GCN-LABEL: {{^}}test_load_mfma_add_store:
115 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
116 ; GCN-COUNT-32: v_accvgpr_write
117 ; GCN: v_mfma_f32_32x32x1f32
118 ; GCN-COUNT-32: v_accvgpr_read
120 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
121 define amdgpu_kernel void @test_load_mfma_add_store(<32 x float> addrspace(1)* %arg) #0 {
123 %tid = call i32 @llvm.amdgcn.workitem.id.x()
124 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
125 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
126 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
; Vector arithmetic on the MFMA result, also reusing the loaded value.
127 %add.1 = fadd <32 x float> %mai.1, %in.1
128 store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
; Kernel: vector arithmetic on both sides of the MFMA. Loads a <32 x float>,
; adds it to itself, runs the f32 32x32x1 MFMA on the sum, squares the MFMA
; result with an fmul, and stores the product.
; Expected codegen per the checks below: eight dwordx4 loads into v-registers,
; 32 v_accvgpr_write before the MFMA, 32 v_accvgpr_read after it, and eight
; dwordx4 stores whose data comes from v-registers.
132 ; GCN-LABEL: {{^}}test_load_add_mfma_mul_store:
133 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
135 ; GCN-COUNT-32: v_accvgpr_write
136 ; GCN: v_mfma_f32_32x32x1f32
137 ; GCN-COUNT-32: v_accvgpr_read
139 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
140 define amdgpu_kernel void @test_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) #0 {
142 %tid = call i32 @llvm.amdgcn.workitem.id.x()
143 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
144 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
145 %add.1 = fadd <32 x float> %in.1, %in.1
146 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
; Both fmul operands are the MFMA result.
147 %mul.1 = fmul <32 x float> %mai.1, %mai.1
148 store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
; Kernel: like test_load_add_mfma_mul_store, except that the final fmul
; multiplies the MFMA result by the originally loaded vector, so the load has
; two vector-arithmetic users (the fadd feeding the MFMA and the fmul after it).
; Expected codegen per the checks below: eight dwordx4 loads into v-registers,
; 32 v_accvgpr_write before the MFMA, 32 v_accvgpr_read after it, and eight
; dwordx4 stores whose data comes from v-registers.
152 ; GCN-LABEL: {{^}}test_mixeduse_load_add_mfma_mul_store:
153 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
154 ; GCN-COUNT-32: v_accvgpr_write
155 ; GCN: v_mfma_f32_32x32x1f32
156 ; GCN-COUNT-32: v_accvgpr_read
158 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
159 define amdgpu_kernel void @test_mixeduse_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) #0 {
161 %tid = call i32 @llvm.amdgcn.workitem.id.x()
162 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
163 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
164 %add.1 = fadd <32 x float> %in.1, %in.1
165 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
; Second use of the loaded value, this time after the MFMA.
166 %mul.1 = fmul <32 x float> %mai.1, %in.1
167 store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
; Kernel: one loaded <32 x float> feeds two f32 32x32x1 MFMA calls that differ
; only in their three trailing immediate arguments (1, 2, 3 versus 0, 0, 0).
; The first result is stored to %gep.1, the second to %gep.2, which lies 32
; vector elements past %gep.1.
; Expected codegen per the checks below: the eight dwordx4 loads write
; a-registers directly, no v_accvgpr_write appears before the MFMA, no
; v_accvgpr_read appears after it, and dwordx4 stores take their data from
; a-registers.
; NOTE(review): the IR contains two MFMA calls and two vector stores, but the
; checks match only one MFMA and eight stores; confirm whether the second
; MFMA and second group of stores are meant to be left unchecked.
171 ; GCN-LABEL: {{^}}test_multiuse_load_mfma_mfma_store:
172 ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
173 ; GCN-NOT: v_accvgpr_write
174 ; GCN: v_mfma_f32_32x32x1f32
175 ; GCN-NOT: v_accvgpr_read
176 ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
177 define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(<32 x float> addrspace(1)* %arg) #0 {
179 %tid = call i32 @llvm.amdgcn.workitem.id.x()
180 %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
181 %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
182 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
; Both MFMA calls consume the same loaded vector.
183 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
184 %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
185 store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep.1
186 store <32 x float> %mai.2, <32 x float> addrspace(1)* %gep.2
; Kernel: the i32 returned by a volatile atomicrmw sub becomes element 0 of a
; <4 x i32> whose other three elements are zero; that vector is the vector
; operand of the i32 4x4x4 i8 MFMA. Element 0 of the MFMA result is then the
; operand of a volatile atomicrmw add, and the value returned by the add is
; stored to %arg.
; Expected codegen per the checks below: the atomic sub returns into a
; v-register (IN), four v_accvgpr_write instructions (one of them from IN)
; feed the MFMA, the first register of the MFMA destination (N) is moved to a
; v-register (V) with v_accvgpr_read, the atomic add uses V as data, and the
; final dword store uses v-registers.
190 ; NB: for atomics both vdata and vdst shall be either VGPR or AGPR
191 ; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic_store:
192 ; GCN: global_atomic_sub [[IN:v[0-9]+]], v{{[0-9:]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}] glc
193 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[IN]]
194 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
195 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
196 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
197 ; GCN: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:
198 ; GCN: v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}}
199 ; GCN: global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc
200 ; GCN: global_store_dword v{{[0-9]+}}, v{{[0-9]+}},
201 define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(i32 addrspace(1)* %arg) #0 {
203 %tid = call i32 @llvm.amdgcn.workitem.id.x()
204 %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tid
205 %in.1 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst
; Build <%in.1, 0, 0, 0> one element at a time.
206 %tmp0 = insertelement <4 x i32> undef, i32 %in.1, i32 0
207 %tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1
208 %tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2
209 %tmp3 = insertelement <4 x i32> %tmp2, i32 0, i32 3
210 %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp3, i32 0, i32 0, i32 0)
211 %elt = extractelement <4 x i32> %mai.1, i32 0
212 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %elt seq_cst
; The stored value is the result returned by the atomic add, written to the base pointer.
213 store i32 %val, i32 addrspace(1)* %arg
; Kernel: 64-bit variant of the atomic / MFMA round trip. The i64 returned by a
; volatile atomicrmw sub becomes element 0 of a <2 x i64> whose element 1 is
; zero; that vector is bitcast to <4 x i32> and used as the vector operand of
; the i32 4x4x4 i8 MFMA. Elements 0 and 1 of the MFMA result are repacked into
; an i64 that is the operand of a volatile atomicrmw add, and the value
; returned by the add is stored to %arg.
; Expected codegen per the checks below: the x2 atomic sub works on
; v-registers, four v_accvgpr_write from v-registers precede the MFMA, two
; v_accvgpr_read into v-registers follow it, and the x2 atomic add takes
; v-registers as data.
217 ; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store:
218 ; GCN: global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
219 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
220 ; GCN: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:
221 ; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
222 ; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
223 ; GCN: global_atomic_add_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
224 define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(i64 addrspace(1)* %arg) #0 {
226 %tid = call i32 @llvm.amdgcn.workitem.id.x()
227 %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
228 %in.1 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 1 seq_cst
; Build <%in.1, 0> as a <2 x i64>.
229 %tmp0 = insertelement <2 x i64> undef, i64 %in.1, i32 0
230 %tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1
; Bitcast the fully initialised %tmp1 (element 1 = 0) rather than %tmp0, whose
; element 1 is still undef; otherwise %tmp1 is dead and the upper two dwords
; of the MFMA vector operand are undefined.
231 %tmp2 = bitcast <2 x i64> %tmp1 to <4 x i32>
232 %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp2, i32 0, i32 0, i32 0)
; Repack result elements 0 and 1 into one i64 for the 64-bit atomic add.
233 %elt.1 = extractelement <4 x i32> %mai.1, i32 0
234 %elt.2 = extractelement <4 x i32> %mai.1, i32 1
235 %v2.1 = insertelement <2 x i32> undef, i32 %elt.1, i32 0
236 %v2.2 = insertelement <2 x i32> %v2.1, i32 %elt.2, i32 1
237 %v2 = bitcast <2 x i32> %v2.2 to i64
238 %val = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %v2 seq_cst
239 store i64 %val, i64 addrspace(1)* %arg
; Kernel: addrspace(3) variant. Loads a <4 x i32> from %arg indexed by the
; workitem id, runs the i32 4x4x4 i8 MFMA on it, then performs two i32 stores:
; the constant 1 to the base pointer and element 0 of the MFMA result 32 i32
; elements (128 bytes) past the base pointer.
; Expected codegen per the checks below: ds_read_b128 writes an a-register
; tuple (IN) with no v_accvgpr_write, the MFMA takes IN as its last operand
; and defines a tuple starting at a[N], one ds_write_b32 takes its data from a
; v-register, and the ds_write_b32 at offset:128 takes a[N] directly with no
; v_accvgpr_read.
243 ; NB: both data operands should be VGPR or AGPR
244 ; GCN-LABEL: {{^}}test_load_mfma_ds2_store:
245 ; GCN-DAG: ds_read_b128 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}
246 ; GCN-NOT: v_accvgpr_write
247 ; GCN-DAG: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:{{[0-9]+}}], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
248 ; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
249 ; GCN-NOT: v_accvgpr_read
250 ; GCN: ds_write_b32 v{{[0-9]+}}, a[[N]] offset:128
251 define amdgpu_kernel void @test_load_mfma_ds2_store(<4 x i32> addrspace(3)* %arg) #0 {
253 %tid = call i32 @llvm.amdgcn.workitem.id.x()
254 %gep.1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(3)* %arg, i32 %tid
255 %in.1 = load <4 x i32>, <4 x i32> addrspace(3)* %gep.1
256 %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
257 %elt = extractelement <4 x i32> %mai.1, i32 0
; View the base pointer as i32* so the two scalar stores can be addressed from it.
258 %ptr = bitcast <4 x i32> addrspace(3)* %arg to i32 addrspace(3)*
259 %gep.2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 32
; One store of a constant and one store of the MFMA result element.
260 store i32 1, i32 addrspace(3)* %ptr
261 store i32 %elt, i32 addrspace(3)* %gep.2
; Kernel: MFMA inside a loop. A <4 x i32> is loaded once before the loop; in
; the loop a phi selects either that loaded value (coming from %entry) or the
; previous iteration's MFMA result, and feeds it to the i32 4x4x4 i8 MFMA.
; The loop leaves for %exit once the incremented counter equals 16, and the
; last MFMA result is stored back to the load address.
; Expected codegen per the checks below: the dwordx4 load writes an a-register
; tuple (IN) with no v_accvgpr_write, the MFMA takes IN as its last operand
; and defines RES, and the dwordx4 store takes RES as data with no
; v_accvgpr_read.
265 ; GCN-LABEL: {{^}}test_mfma_loop_4xi32:
266 ; GCN: global_load_dwordx4 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}, s[{{[0-9:]+}}]
267 ; GCN-NOT: v_accvgpr_write
268 ; GCN: v_mfma_i32_4x4x4i8 [[RES:a\[[0-9:]+\]]], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
269 ; GCN-NOT: v_accvgpr_read
270 ; GCN: global_store_dwordx4 v[{{[0-9:]+}}], [[RES]],
271 define amdgpu_kernel void @test_mfma_loop_4xi32(<4 x i32> addrspace(1)* %arg) #0 {
273 %tid = call i32 @llvm.amdgcn.workitem.id.x()
274 %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
275 %in = load <4 x i32>, <4 x i32> addrspace(1)* %gep
276 br label %for.cond.preheader
; Loop-carried vector operand: initial load on entry, previous MFMA result afterwards.
279 %phi = phi <4 x i32> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
; Iteration counter, starting at 0.
280 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
281 %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %phi, i32 0, i32 0, i32 0)
282 %inc = add nuw nsw i32 %c, 1
283 %cc = icmp eq i32 %inc, 16
284 br i1 %cc, label %exit, label %for.cond.preheader
287 store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
; Kernel: <32 x float> version of the MFMA loop. The vector is loaded once
; before the loop; in the loop a phi selects either that loaded value (coming
; from %entry) or the previous iteration's MFMA result and feeds it to the f32
; 32x32x1 MFMA. The loop leaves for %exit once the incremented counter equals
; 16, and the last MFMA result is stored back to the load address.
; Expected codegen per the checks below: the eight dwordx4 loads write
; a-registers directly, no v_accvgpr_write appears before the MFMA, no
; v_accvgpr_read appears after it, and the eight dwordx4 stores take their
; data from a-registers.
291 ; GCN-LABEL: {{^}}test_mfma_loop_32xfloat:
292 ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
293 ; GCN-NOT: v_accvgpr_write
294 ; GCN: v_mfma_f32_32x32x1f32
295 ; GCN-NOT: v_accvgpr_read
296 ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}],
298 define amdgpu_kernel void @test_mfma_loop_32xfloat(<32 x float> addrspace(1)* %arg) #0 {
300 %tid = call i32 @llvm.amdgcn.workitem.id.x()
301 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
302 %in = load <32 x float>, <32 x float> addrspace(1)* %gep
303 br label %for.cond.preheader
; Loop-carried vector operand: initial load on entry, previous MFMA result afterwards.
306 %phi = phi <32 x float> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
; Iteration counter, starting at 0.
307 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
308 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
309 %inc = add nuw nsw i32 %c, 1
310 %cc = icmp eq i32 %inc, 16
311 br i1 %cc, label %exit, label %for.cond.preheader
314 store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
; Attribute group #0, referenced by every kernel definition in this file:
; sets "amdgpu-flat-work-group-size" to "1,256".
318 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }