; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "// kill:" --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f8f16,+sme-f8f32 -force-streaming < %s | FileCheck %s
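
; FMLAL (indexed): FP8 widening multiply-add long into two ZA.h slices, multiplicand selected by lane (+sme-f8f16).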

define void @test_fmlal_vg2x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlal_vg2x1:
; CHECK: fmlal za.h[w8, 0:1], z0.b, z1.b[0]
; CHECK: fmlal za.h[w8, 14:15], z0.b, z1.b[15]
  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 %slice,
                                                        <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm,
                                                        i32 0)
  %add = add i32 %slice, 14
  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x1(i32 %add,
                                                        <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm,
                                                        i32 15)
  ret void
}

define void @test_fmlal_vg2x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlal_vg2x2:
; CHECK: fmlal za.h[w8, 0:1, vgx2], { z0.b, z1.b }, z2.b[0]
; CHECK: fmlal za.h[w8, 6:7, vgx2], { z0.b, z1.b }, z2.b[15]
  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 %slice,
                                                        <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                        <vscale x 16 x i8> %zm,
                                                        i32 0)
  %add = add i32 %slice, 6
  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x2(i32 %add,
                                                        <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                        <vscale x 16 x i8> %zm,
                                                        i32 15)
  ret void
}

define void @test_fmlal_vg2x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlal_vg2x4:
; CHECK: fmlal za.h[w8, 0:1, vgx4], { z0.b - z3.b }, z4.b[0]
; CHECK: fmlal za.h[w8, 6:7, vgx4], { z0.b - z3.b }, z4.b[15]
  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 %slice,
                                                        <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                        <vscale x 16 x i8> %zm,
                                                        i32 0)
  %add = add i32 %slice, 6
  call void @llvm.aarch64.sme.fp8.fmlal.lane.za16.vg2x4(i32 %add,
                                                        <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                        <vscale x 16 x i8> %zm,
                                                        i32 15)
  ret void
}
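
; FMLALL (indexed): FP8 widening multiply-add long-long into four ZA.s slices, multiplicand selected by lane (+sme-f8f32).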

define void @test_fmlall_vg4x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlall_vg4x1:
; CHECK: fmlall za.s[w8, 0:3], z0.b, z1.b[0]
; CHECK: fmlall za.s[w8, 12:15], z0.b, z1.b[15]
  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 %slice,
                                                         <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm,
                                                         i32 0)
  %add = add i32 %slice, 12
  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x1(i32 %add,
                                                         <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm,
                                                         i32 15)
  ret void
}

define void @test_fmlall_vg4x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlall_vg4x2:
; CHECK: fmlall za.s[w8, 0:3, vgx2], { z0.b, z1.b }, z2.b[0]
; CHECK: fmlall za.s[w8, 4:7, vgx2], { z0.b, z1.b }, z2.b[15]
  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 %slice,
                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                         <vscale x 16 x i8> %zm,
                                                         i32 0)
  %add = add i32 %slice, 4
  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x2(i32 %add,
                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                         <vscale x 16 x i8> %zm,
                                                         i32 15)
  ret void
}

define void @test_fmlall_vg4x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlall_vg4x4:
; CHECK: fmlall za.s[w8, 0:3, vgx4], { z0.b - z3.b }, z4.b[8]
; CHECK: fmlall za.s[w8, 4:7, vgx4], { z0.b - z3.b }, z4.b[15]
  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x4(i32 %slice,
                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                         <vscale x 16 x i8> %zm,
                                                         i32 8)
  %add = add i32 %slice, 4
  call void @llvm.aarch64.sme.fp8.fmlall.lane.za32.vg4x4(i32 %add,
                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                         <vscale x 16 x i8> %zm,
                                                         i32 15)
  ret void
}
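
; FMLAL (single): one multiplier vector applied to one, two, or four ZN vectors.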

define void @test_fmlal_single_vg2x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlal_single_vg2x1:
; CHECK: fmlal za.h[w8, 0:1], z0.b, z1.b
; CHECK: fmlal za.h[w8, 14:15], z0.b, z1.b
  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %add = add i32 %slice, 14
  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 %add, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

define void @test_fmlal_single_vg2x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlal_single_vg2x2:
; CHECK: fmlal za.h[w8, 0:1, vgx2], { z0.b, z1.b }, z2.b
; CHECK: fmlal za.h[w8, 6:7, vgx2], { z0.b, z1.b }, z2.b
  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 %slice,
                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                          <vscale x 16 x i8> %zm)
  %add = add i32 %slice, 6
  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 %add,
                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                          <vscale x 16 x i8> %zm)
  ret void
}

define void @test_fmlal_single_vg2x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlal_single_vg2x4:
; CHECK: fmlal za.h[w8, 0:1, vgx4], { z0.b - z3.b }, z4.b
; CHECK: fmlal za.h[w8, 6:7, vgx4], { z0.b - z3.b }, z4.b
  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 %slice,
                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                          <vscale x 16 x i8> %zm)
  %add = add i32 %slice, 6
  call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 %add,
                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                          <vscale x 16 x i8> %zm)
  ret void
}
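
; FMLALL (single): one multiplier vector applied to one, two, or four ZN vectors.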

define void @test_fmlall_single_vg4x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlall_single_vg4x1:
; CHECK: fmlall za.s[w8, 0:3], z0.b, z1.b
; CHECK: fmlall za.s[w8, 12:15], z0.b, z1.b
  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %add = add i32 %slice, 12
  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 %add, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

define void @test_fmlall_single_vg4x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlall_single_vg4x2:
; CHECK: fmlall za.s[w8, 0:3, vgx2], { z0.b, z1.b }, z2.b
; CHECK: fmlall za.s[w8, 4:7, vgx2], { z0.b, z1.b }, z2.b
  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 %slice,
                                                           <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                           <vscale x 16 x i8> %zm)
  %add = add i32 %slice, 4
  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 %add,
                                                           <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                           <vscale x 16 x i8> %zm)
  ret void
}

define void @test_fmlall_single_vg4x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_fmlall_single_vg4x4:
; CHECK: fmlall za.s[w8, 0:3, vgx4], { z0.b - z3.b }, z4.b
; CHECK: fmlall za.s[w8, 4:7, vgx4], { z0.b - z3.b }, z4.b
  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 %slice,
                                                           <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                           <vscale x 16 x i8> %zm)
  %add = add i32 %slice, 4
  call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 %add,
                                                           <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                           <vscale x 16 x i8> %zm)
  ret void
}
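
; FMLAL (multi): multiplier is a matching group of two or four ZM vectors.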

define void @test_fmlal_multi_vg2x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: test_fmlal_multi_vg2x2:
; CHECK: fmlal za.h[w8, 0:1, vgx2], { z0.b, z1.b }, { z2.b, z3.b }
; CHECK: fmlal za.h[w8, 6:7, vgx2], { z0.b, z1.b }, { z2.b, z3.b }
  call void @llvm.aarch64.sme.fp8.fmlal.multi.za16.vg2x2(i32 %slice,
                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                         <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %add = add i32 %slice, 6
  call void @llvm.aarch64.sme.fp8.fmlal.multi.za16.vg2x2(i32 %add,
                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                         <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

define void @test_fmlal_multi_vg2x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: test_fmlal_multi_vg2x4:
; CHECK: fmlal za.h[w8, 0:1, vgx4], { z0.b - z3.b }, { z4.b - z7.b }
; CHECK: fmlal za.h[w8, 6:7, vgx4], { z0.b - z3.b }, { z4.b - z7.b }
                                    <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
  call void @llvm.aarch64.sme.fp8.fmlal.multi.za16.vg2x4(i32 %slice,
                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                         <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %add = add i32 %slice, 6
  call void @llvm.aarch64.sme.fp8.fmlal.multi.za16.vg2x4(i32 %add,
                                                         <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                         <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}
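
; FMLALL (multi): multiplier is a matching group of two or four ZM vectors.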

define void @test_fmlall_multi_vg4x2(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: test_fmlall_multi_vg4x2:
; CHECK: fmlall za.s[w8, 0:3, vgx2], { z0.b, z1.b }, { z2.b, z3.b }
; CHECK: fmlall za.s[w8, 4:7, vgx2], { z0.b, z1.b }, { z2.b, z3.b }
  call void @llvm.aarch64.sme.fp8.fmlall.multi.za32.vg4x2(i32 %slice,
                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                          <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %add = add i32 %slice, 4
  call void @llvm.aarch64.sme.fp8.fmlall.multi.za32.vg4x2(i32 %add,
                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
                                                          <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

define void @test_fmlall_multi_vg4x4(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: test_fmlall_multi_vg4x4:
; CHECK: fmlall za.s[w8, 0:3, vgx4], { z0.b - z3.b }, { z4.b - z7.b }
; CHECK: fmlall za.s[w8, 4:7, vgx4], { z0.b - z3.b }, { z4.b - z7.b }
                                     <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
  call void @llvm.aarch64.sme.fp8.fmlall.multi.za32.vg4x4(i32 %slice,
                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                          <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %add = add i32 %slice, 4
  call void @llvm.aarch64.sme.fp8.fmlall.multi.za32.vg4x4(i32 %add,
                                                          <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                          <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}