1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+bf16 -force-streaming -verify-machineinstrs < %s | FileCheck %s
5 ; BF/F/S/UMLAL x1 (SINGLE)
; Widening multiply-add of one Zn/Zm pair into a single ZA.S double slice.
; Each test uses slice offsets 0 and 14, the two ends of the legal 0:1..14:15
; slice-immediate range for these instructions.
8 define void @multi_vector_add_single_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
9 ; CHECK-LABEL: multi_vector_add_single_vg2x1_bf16:
11 ; CHECK-NEXT: mov w8, w0
12 ; CHECK-NEXT: bfmlal za.s[w8, 0:1], z0.h, z1.h
13 ; CHECK-NEXT: bfmlal za.s[w8, 14:15], z0.h, z1.h
15 call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
16 %slice.14 = add i32 %slice, 14
17 call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
21 define void @multi_vector_add_single_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
22 ; CHECK-LABEL: multi_vector_add_single_vg2x1_f16:
24 ; CHECK-NEXT: mov w8, w0
25 ; CHECK-NEXT: fmlal za.s[w8, 0:1], z0.h, z1.h
26 ; CHECK-NEXT: fmlal za.s[w8, 14:15], z0.h, z1.h
28 call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
29 %slice.14 = add i32 %slice, 14
30 call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
34 define void @multi_vector_add_single_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
35 ; CHECK-LABEL: multi_vector_add_single_vg2x1_s16:
37 ; CHECK-NEXT: mov w8, w0
38 ; CHECK-NEXT: smlal za.s[w8, 0:1], z0.h, z1.h
39 ; CHECK-NEXT: smlal za.s[w8, 14:15], z0.h, z1.h
41 call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
42 %slice.14 = add i32 %slice, 14
43 call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
47 define void @multi_vector_add_single_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
48 ; CHECK-LABEL: multi_vector_add_single_vg2x1_u16:
50 ; CHECK-NEXT: mov w8, w0
51 ; CHECK-NEXT: umlal za.s[w8, 0:1], z0.h, z1.h
52 ; CHECK-NEXT: umlal za.s[w8, 14:15], z0.h, z1.h
54 call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
55 %slice.14 = add i32 %slice, 14
56 call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
61 ; BF/F/S/UMLSL x1 (SINGLE)
; Widening multiply-subtract of one Zn/Zm pair into a single ZA.S double
; slice; mirrors the MLAL x1 tests above with the *mlsl intrinsics.
64 define void @multi_vector_sub_single_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
65 ; CHECK-LABEL: multi_vector_sub_single_vg2x1_bf16:
67 ; CHECK-NEXT: mov w8, w0
68 ; CHECK-NEXT: bfmlsl za.s[w8, 0:1], z0.h, z1.h
69 ; CHECK-NEXT: bfmlsl za.s[w8, 14:15], z0.h, z1.h
71 call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
72 %slice.14 = add i32 %slice, 14
73 call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
77 define void @multi_vector_sub_single_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
78 ; CHECK-LABEL: multi_vector_sub_single_vg2x1_f16:
80 ; CHECK-NEXT: mov w8, w0
81 ; CHECK-NEXT: fmlsl za.s[w8, 0:1], z0.h, z1.h
82 ; CHECK-NEXT: fmlsl za.s[w8, 14:15], z0.h, z1.h
84 call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
85 %slice.14 = add i32 %slice, 14
86 call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
90 define void @multi_vector_sub_single_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
91 ; CHECK-LABEL: multi_vector_sub_single_vg2x1_s16:
93 ; CHECK-NEXT: mov w8, w0
94 ; CHECK-NEXT: smlsl za.s[w8, 0:1], z0.h, z1.h
95 ; CHECK-NEXT: smlsl za.s[w8, 14:15], z0.h, z1.h
97 call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
98 %slice.14 = add i32 %slice, 14
99 call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
103 define void @multi_vector_sub_single_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
104 ; CHECK-LABEL: multi_vector_sub_single_vg2x1_u16:
106 ; CHECK-NEXT: mov w8, w0
107 ; CHECK-NEXT: umlsl za.s[w8, 0:1], z0.h, z1.h
108 ; CHECK-NEXT: umlsl za.s[w8, 14:15], z0.h, z1.h
110 call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
111 %slice.14 = add i32 %slice, 14
112 call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
117 ; BF/F/S/UMLAL x2 (SINGLE)
; Two-register (vgx2) forms multiplied by a single Zm. The incoming z0/z1
; arguments must be tied into the Z0_Z1 tuple register, hence the kill
; markers. Slice offsets 0 and 6 cover the 0..6 even-offset range for vgx2.
120 define void @multi_vector_add_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
121 ; CHECK-LABEL: multi_vector_add_single_vg2x2_bf16:
123 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
124 ; CHECK-NEXT: mov w8, w0
125 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
126 ; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
127 ; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
129 call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
130 %slice.6 = add i32 %slice, 6
131 call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
135 define void @multi_vector_add_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
136 ; CHECK-LABEL: multi_vector_add_single_vg2x2_f16:
138 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
139 ; CHECK-NEXT: mov w8, w0
140 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
141 ; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
142 ; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
144 call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
145 %slice.6 = add i32 %slice, 6
146 call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
150 define void @multi_vector_add_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
151 ; CHECK-LABEL: multi_vector_add_single_vg2x2_s16:
153 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
154 ; CHECK-NEXT: mov w8, w0
155 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
156 ; CHECK-NEXT: smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
157 ; CHECK-NEXT: smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
159 call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
160 %slice.6 = add i32 %slice, 6
161 call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
165 define void @multi_vector_add_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
166 ; CHECK-LABEL: multi_vector_add_single_vg2x2_u16:
168 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
169 ; CHECK-NEXT: mov w8, w0
170 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
171 ; CHECK-NEXT: umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
172 ; CHECK-NEXT: umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
174 call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
175 %slice.6 = add i32 %slice, 6
176 call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
181 ; BF/F/S/UMLSL x2 (SINGLE)
; Two-register (vgx2) multiply-subtract against a single Zm; mirrors the
; MLAL x2 tests above with the *mlsl intrinsics.
184 define void @multi_vector_sub_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
185 ; CHECK-LABEL: multi_vector_sub_single_vg2x2_bf16:
187 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
188 ; CHECK-NEXT: mov w8, w0
189 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
190 ; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
191 ; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
193 call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
194 %slice.6 = add i32 %slice, 6
195 call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
199 define void @multi_vector_sub_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
200 ; CHECK-LABEL: multi_vector_sub_single_vg2x2_f16:
202 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
203 ; CHECK-NEXT: mov w8, w0
204 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
205 ; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
206 ; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
208 call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
209 %slice.6 = add i32 %slice, 6
210 call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
214 define void @multi_vector_sub_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
215 ; CHECK-LABEL: multi_vector_sub_single_vg2x2_s16:
217 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
218 ; CHECK-NEXT: mov w8, w0
219 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
220 ; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
221 ; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
223 call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
224 %slice.6 = add i32 %slice, 6
225 call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
229 define void @multi_vector_sub_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
230 ; CHECK-LABEL: multi_vector_sub_single_vg2x2_u16:
232 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
233 ; CHECK-NEXT: mov w8, w0
234 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
235 ; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
236 ; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
238 call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
239 %slice.6 = add i32 %slice, 6
240 call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
245 ; BF/F/S/UMLAL x4 (SINGLE)
; Four-register (vgx4) forms multiplied by a single Zm: the z0-z3 arguments
; are tied into the Z0_Z1_Z2_Z3 tuple and Zm arrives in z4.
248 define void @multi_vector_add_single_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
249 ; CHECK-LABEL: multi_vector_add_single_vg2x4_bf16:
251 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
252 ; CHECK-NEXT: mov w8, w0
253 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
254 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
255 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
256 ; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
257 ; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
259 call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice,
260 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
261 <vscale x 8 x bfloat> %zm)
262 %slice.6 = add i32 %slice, 6
263 call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice.6,
264 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
265 <vscale x 8 x bfloat> %zm)
269 define void @multi_vector_add_single_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
; Fixed copy-paste error: the calls previously passed %zn2 twice and left
; %zn3 unused, which forced a stray "mov z3.d, z2.d" into the generated code
; and made this test inconsistent with its bf16/s16/u16 siblings. Passing
; %zn3 as the fourth source maps the { z0.h - z3.h } tuple directly onto the
; incoming arguments; assertions updated to match the sibling pattern.
270 ; CHECK-LABEL: multi_vector_add_single_vg2x4_f16:
272 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
273 ; CHECK-NEXT: mov w8, w0
274 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
275 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
276 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
277 ; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
278 ; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
280 call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice,
281 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
282 <vscale x 8 x half> %zm)
283 %slice.6 = add i32 %slice, 6
284 call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice.6,
285 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
286 <vscale x 8 x half> %zm)
; Integer (signed/unsigned) vgx4 multiply-add against a single Zm.
290 define void @multi_vector_add_single_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
291 ; CHECK-LABEL: multi_vector_add_single_vg2x4_s16:
293 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
294 ; CHECK-NEXT: mov w8, w0
295 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
296 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
297 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
298 ; CHECK-NEXT: smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
299 ; CHECK-NEXT: smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
301 call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice,
302 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
303 <vscale x 8 x i16> %zm)
304 %slice.6 = add i32 %slice, 6
305 call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice.6,
306 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
307 <vscale x 8 x i16> %zm)
311 define void @multi_vector_add_single_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
312 ; CHECK-LABEL: multi_vector_add_single_vg2x4_u16:
314 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
315 ; CHECK-NEXT: mov w8, w0
316 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
317 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
318 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
319 ; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
320 ; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
322 call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice,
323 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
324 <vscale x 8 x i16> %zm)
325 %slice.6 = add i32 %slice, 6
326 call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice.6,
327 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
328 <vscale x 8 x i16> %zm)
333 ; BF/F/S/UMLSL x4 (SINGLE)
; Four-register (vgx4) multiply-subtract against a single Zm; mirrors the
; MLAL x4 tests above with the *mlsl intrinsics.
336 define void @multi_vector_sub_single_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
337 ; CHECK-LABEL: multi_vector_sub_single_vg2x4_bf16:
339 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
340 ; CHECK-NEXT: mov w8, w0
341 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
342 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
343 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
344 ; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
345 ; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
347 call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice,
348 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
349 <vscale x 8 x bfloat> %zm)
350 %slice.6 = add i32 %slice, 6
351 call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice.6,
352 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
353 <vscale x 8 x bfloat> %zm)
357 define void @multi_vector_sub_single_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
358 ; CHECK-LABEL: multi_vector_sub_single_vg2x4_f16:
360 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
361 ; CHECK-NEXT: mov w8, w0
362 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
363 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
364 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
365 ; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
366 ; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
368 call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice,
369 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
370 <vscale x 8 x half> %zm)
371 %slice.6 = add i32 %slice, 6
372 call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice.6,
373 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
374 <vscale x 8 x half> %zm)
378 define void @multi_vector_sub_single_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
379 ; CHECK-LABEL: multi_vector_sub_single_vg2x4_s16:
381 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
382 ; CHECK-NEXT: mov w8, w0
383 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
384 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
385 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
386 ; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
387 ; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
389 call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice,
390 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
391 <vscale x 8 x i16> %zm)
392 %slice.6 = add i32 %slice, 6
393 call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice.6,
394 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
395 <vscale x 8 x i16> %zm)
399 define void @multi_vector_sub_single_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
400 ; CHECK-LABEL: multi_vector_sub_single_vg2x4_u16:
402 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
403 ; CHECK-NEXT: mov w8, w0
404 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
405 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
406 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
407 ; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
408 ; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
410 call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice,
411 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
412 <vscale x 8 x i16> %zm)
413 %slice.6 = add i32 %slice, 6
414 call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice.6,
415 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
416 <vscale x 8 x i16> %zm)
421 ; BF/F/S/UMLAL x2 (MULTI)
; Two-register (vgx2) multiply-add where Zm is also a register tuple:
; zn0/zn1 are tied into Z0_Z1 and zm0/zm1 into Z2_Z3.
424 define void @multi_vector_add_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1) {
425 ; CHECK-LABEL: multi_vector_add_multi_vg2x2_bf16:
427 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
428 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
429 ; CHECK-NEXT: mov w8, w0
430 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
431 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
432 ; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
433 ; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
435 call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
436 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
437 %slice.6 = add i32 %slice, 6
438 call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
439 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
443 define void @multi_vector_add_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
444 ; CHECK-LABEL: multi_vector_add_multi_vg2x2_f16:
446 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
447 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
448 ; CHECK-NEXT: mov w8, w0
449 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
450 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
451 ; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
452 ; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
454 call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
455 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
456 %slice.6 = add i32 %slice, 6
457 call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
458 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
462 define void @multi_vector_add_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
463 ; CHECK-LABEL: multi_vector_add_multi_vg2x2_s16:
465 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
466 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
467 ; CHECK-NEXT: mov w8, w0
468 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
469 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
470 ; CHECK-NEXT: smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
471 ; CHECK-NEXT: smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
473 call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
474 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
475 %slice.6 = add i32 %slice, 6
476 call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
477 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
481 define void @multi_vector_add_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
482 ; CHECK-LABEL: multi_vector_add_multi_vg2x2_u16:
484 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
485 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
486 ; CHECK-NEXT: mov w8, w0
487 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
488 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
489 ; CHECK-NEXT: umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
490 ; CHECK-NEXT: umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
492 call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
493 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
494 %slice.6 = add i32 %slice, 6
495 call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
496 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
501 ; BF/F/S/UMLSL x2 (MULTI)
; Two-register (vgx2) multiply-subtract with a register-tuple Zm; mirrors
; the MLAL x2 (MULTI) tests above with the *mlsl intrinsics.
504 define void @multi_vector_sub_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1) {
505 ; CHECK-LABEL: multi_vector_sub_multi_vg2x2_bf16:
507 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
508 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
509 ; CHECK-NEXT: mov w8, w0
510 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
511 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
512 ; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
513 ; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
515 call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
516 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
517 %slice.6 = add i32 %slice, 6
518 call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
519 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
523 define void @multi_vector_sub_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
524 ; CHECK-LABEL: multi_vector_sub_multi_vg2x2_f16:
526 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
527 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
528 ; CHECK-NEXT: mov w8, w0
529 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
530 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
531 ; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
532 ; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
534 call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
535 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
536 %slice.6 = add i32 %slice, 6
537 call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
538 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
542 define void @multi_vector_sub_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
543 ; CHECK-LABEL: multi_vector_sub_multi_vg2x2_s16:
545 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
546 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
547 ; CHECK-NEXT: mov w8, w0
548 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
549 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
550 ; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
551 ; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
553 call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
554 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
555 %slice.6 = add i32 %slice, 6
556 call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
557 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
561 define void @multi_vector_sub_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
562 ; CHECK-LABEL: multi_vector_sub_multi_vg2x2_u16:
564 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
565 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
566 ; CHECK-NEXT: mov w8, w0
567 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
568 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
569 ; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
570 ; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
572 call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
573 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
574 %slice.6 = add i32 %slice, 6
575 call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
576 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
581 ; BF/F/S/UMLAL x4 (MULTI)
; Four-register (vgx4) multiply-add with a register-tuple Zm: zn0-zn3 tie
; into Z0_Z1_Z2_Z3 and zm0-zm3 into Z4_Z5_Z6_Z7. Note the signature is
; split across the autogenerated assertions, as emitted by
; update_llc_test_checks.py.
584 define void @multi_vector_add_multi_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
585 ; CHECK-LABEL: multi_vector_add_multi_vg2x4_bf16:
587 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
588 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
589 ; CHECK-NEXT: mov w8, w0
590 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
591 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
592 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
593 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
594 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
595 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
596 ; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
597 ; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
599 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) {
600 call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice,
601 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
602 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
603 %slice.6 = add i32 %slice, 6
604 call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice.6,
605 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
606 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
; f16 variant of the vgx4 multi-Zm multiply-add; same register-tuple tying
; as the bf16 test, selecting fmlal instead of bfmlal.
610 define void @multi_vector_add_multi_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
611 ; CHECK-LABEL: multi_vector_add_multi_vg2x4_f16:
613 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
614 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
615 ; CHECK-NEXT: mov w8, w0
616 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
617 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
618 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
619 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
620 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
621 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
622 ; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
623 ; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
625 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) {
626 call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice,
627 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
628 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
629 %slice.6 = add i32 %slice, 6
630 call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice.6,
631 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
632 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
636 define void @multi_vector_add_multi_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
637 ; CHECK-LABEL: multi_vector_add_multi_vg2x4_s16:
639 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
640 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
641 ; CHECK-NEXT: mov w8, w0
642 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
643 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
644 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
645 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
646 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
647 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
648 ; CHECK-NEXT: smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
649 ; CHECK-NEXT: smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
651 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
652 call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice,
653 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
654 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
655 %slice.6 = add i32 %slice, 6
656 call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice.6,
657 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
658 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
662 define void @multi_vector_add_multi_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
663 ; CHECK-LABEL: multi_vector_add_multi_vg2x4_u16:
665 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
666 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
667 ; CHECK-NEXT: mov w8, w0
668 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
669 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
670 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
671 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
672 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
673 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
674 ; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
675 ; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
677 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
678 call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice,
679 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
680 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
681 %slice.6 = add i32 %slice, 6
682 call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice.6,
683 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
684 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
689 ; BF/F/S/UMLSL x4 (MULTI)
692 define void @multi_vector_sub_multi_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
693 ; CHECK-LABEL: multi_vector_sub_multi_vg2x4_bf16:
695 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
696 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
697 ; CHECK-NEXT: mov w8, w0
698 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
699 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
700 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
701 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
702 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
703 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
704 ; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
705 ; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
707 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) {
708 call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice,
709 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
710 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
711 %slice.6 = add i32 %slice, 6
712 call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice.6,
713 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
714 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
718 define void @multi_vector_sub_multi_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
719 ; CHECK-LABEL: multi_vector_sub_multi_vg2x4_f16:
721 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
722 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
723 ; CHECK-NEXT: mov w8, w0
724 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
725 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
726 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
727 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
728 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
729 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
730 ; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
731 ; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
733 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) {
734 call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice,
735 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
736 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
737 %slice.6 = add i32 %slice, 6
738 call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice.6,
739 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
740 <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
744 define void @multi_vector_sub_multi_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
745 ; CHECK-LABEL: multi_vector_sub_multi_vg2x4_s16:
747 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
748 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
749 ; CHECK-NEXT: mov w8, w0
750 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
751 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
752 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
753 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
754 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
755 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
756 ; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
757 ; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
759 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
760 call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice,
761 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
762 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
763 %slice.6 = add i32 %slice, 6
764 call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice.6,
765 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
766 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
770 define void @multi_vector_sub_multi_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
771 ; CHECK-LABEL: multi_vector_sub_multi_vg2x4_u16:
773 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
774 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
775 ; CHECK-NEXT: mov w8, w0
776 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
777 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
778 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
779 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
780 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
781 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
782 ; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
783 ; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
785 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
786 call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice,
787 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
788 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
789 %slice.6 = add i32 %slice, 6
790 call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice.6,
791 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
792 <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
797 ; BF/F/S/UMLAL x1 (INDEXED)
800 define void @multi_vector_add_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
801 ; CHECK-LABEL: multi_vector_add_lane_vg2x1_f16:
803 ; CHECK-NEXT: mov w8, w0
804 ; CHECK-NEXT: fmlal za.s[w8, 0:1], z0.h, z1.h[0]
805 ; CHECK-NEXT: fmlal za.s[w8, 14:15], z0.h, z1.h[7]
807 call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0)
808 %slice.14 = add i32 %slice, 14
809 call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
813 define void @multi_vector_add_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
814 ; CHECK-LABEL: multi_vector_add_lane_vg2x1_bf16:
816 ; CHECK-NEXT: mov w8, w0
817 ; CHECK-NEXT: bfmlal za.s[w8, 0:1], z0.h, z1.h[0]
818 ; CHECK-NEXT: bfmlal za.s[w8, 14:15], z0.h, z1.h[7]
820 call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0)
821 %slice.14 = add i32 %slice, 14
822 call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
826 define void @multi_vector_add_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
827 ; CHECK-LABEL: multi_vector_add_lane_vg2x1_s16:
829 ; CHECK-NEXT: mov w8, w0
830 ; CHECK-NEXT: smlal za.s[w8, 0:1], z0.h, z1.h[0]
831 ; CHECK-NEXT: smlal za.s[w8, 14:15], z0.h, z1.h[7]
833 call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
834 %slice.14 = add i32 %slice, 14
835 call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
839 define void @multi_vector_add_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
840 ; CHECK-LABEL: multi_vector_add_lane_vg2x1_u16:
842 ; CHECK-NEXT: mov w8, w0
843 ; CHECK-NEXT: umlal za.s[w8, 0:1], z0.h, z1.h[0]
844 ; CHECK-NEXT: umlal za.s[w8, 14:15], z0.h, z1.h[7]
846 call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
847 %slice.14 = add i32 %slice, 14
848 call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
853 ; BF/F/S/UMLSL x1 (INDEXED)
856 define void @multi_vector_sub_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
857 ; CHECK-LABEL: multi_vector_sub_lane_vg2x1_f16:
859 ; CHECK-NEXT: mov w8, w0
860 ; CHECK-NEXT: fmlsl za.s[w8, 0:1], z0.h, z1.h[0]
861 ; CHECK-NEXT: fmlsl za.s[w8, 14:15], z0.h, z1.h[7]
863 call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0)
864 %slice.14 = add i32 %slice, 14
865 call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
869 define void @multi_vector_sub_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
870 ; CHECK-LABEL: multi_vector_sub_lane_vg2x1_bf16:
872 ; CHECK-NEXT: mov w8, w0
873 ; CHECK-NEXT: bfmlsl za.s[w8, 0:1], z0.h, z1.h[0]
874 ; CHECK-NEXT: bfmlsl za.s[w8, 14:15], z0.h, z1.h[7]
876 call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0)
877 %slice.14 = add i32 %slice, 14
878 call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
882 define void @multi_vector_sub_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
883 ; CHECK-LABEL: multi_vector_sub_lane_vg2x1_s16:
885 ; CHECK-NEXT: mov w8, w0
886 ; CHECK-NEXT: smlsl za.s[w8, 0:1], z0.h, z1.h[0]
887 ; CHECK-NEXT: smlsl za.s[w8, 14:15], z0.h, z1.h[7]
889 call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
890 %slice.14 = add i32 %slice, 14
891 call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
895 define void @multi_vector_sub_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
896 ; CHECK-LABEL: multi_vector_sub_lane_vg2x1_u16:
898 ; CHECK-NEXT: mov w8, w0
899 ; CHECK-NEXT: umlsl za.s[w8, 0:1], z0.h, z1.h[0]
900 ; CHECK-NEXT: umlsl za.s[w8, 14:15], z0.h, z1.h[7]
902 call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
903 %slice.14 = add i32 %slice, 14
904 call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
909 ; BF/F/S/UMLAL x2 (INDEXED)
912 define void @multi_vector_add_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
913 ; CHECK-LABEL: multi_vector_add_lane_vg2x2_f16:
915 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
916 ; CHECK-NEXT: mov w8, w0
917 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
918 ; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
919 ; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
921 call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice,
922 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0)
923 %slice.6 = add i32 %slice, 6
924 call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice.6,
925 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7)
929 define void @multi_vector_add_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
930 ; CHECK-LABEL: multi_vector_add_lane_vg2x2_bf16:
932 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
933 ; CHECK-NEXT: mov w8, w0
934 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
935 ; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
936 ; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
938 call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice,
939 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0)
940 %slice.6 = add i32 %slice, 6
941 call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice.6,
942 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7)
946 define void @multi_vector_add_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
947 ; CHECK-LABEL: multi_vector_add_lane_vg2x2_s16:
949 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
950 ; CHECK-NEXT: mov w8, w0
951 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
952 ; CHECK-NEXT: smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
953 ; CHECK-NEXT: smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
955 call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice,
956 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
957 %slice.6 = add i32 %slice, 6
958 call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice.6,
959 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
963 define void @multi_vector_add_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
964 ; CHECK-LABEL: multi_vector_add_lane_vg2x2_u16:
966 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
967 ; CHECK-NEXT: mov w8, w0
968 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
969 ; CHECK-NEXT: umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
970 ; CHECK-NEXT: umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
972 call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice,
973 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
974 %slice.6 = add i32 %slice, 6
975 call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice.6,
976 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
981 ; BF/F/S/UMLSL x2 (INDEXED)
984 define void @multi_vector_sub_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
985 ; CHECK-LABEL: multi_vector_sub_lane_vg2x2_f16:
987 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
988 ; CHECK-NEXT: mov w8, w0
989 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
990 ; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
991 ; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
993 call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice,
994 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0)
995 %slice.6 = add i32 %slice, 6
996 call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice.6,
997 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7)
1001 define void @multi_vector_sub_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
1002 ; CHECK-LABEL: multi_vector_sub_lane_vg2x2_bf16:
1004 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
1005 ; CHECK-NEXT: mov w8, w0
1006 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
1007 ; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
1008 ; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
1010 call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice,
1011 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0)
1012 %slice.6 = add i32 %slice, 6
1013 call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice.6,
1014 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7)
1018 define void @multi_vector_sub_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
1019 ; CHECK-LABEL: multi_vector_sub_lane_vg2x2_s16:
1021 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
1022 ; CHECK-NEXT: mov w8, w0
1023 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
1024 ; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
1025 ; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
1027 call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice,
1028 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
1029 %slice.6 = add i32 %slice, 6
1030 call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice.6,
1031 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
1035 define void @multi_vector_sub_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
1036 ; CHECK-LABEL: multi_vector_sub_lane_vg2x2_u16:
1038 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
1039 ; CHECK-NEXT: mov w8, w0
1040 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
1041 ; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
1042 ; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
1044 call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice,
1045 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
1046 %slice.6 = add i32 %slice, 6
1047 call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice.6,
1048 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
1053 ; BF/F/S/UMLAL x4 (INDEXED)
1056 define void @multi_vector_add_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
1057 ; CHECK-LABEL: multi_vector_add_lane_vg2x4_f16:
1059 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1060 ; CHECK-NEXT: mov w8, w0
1061 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1062 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1063 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1064 ; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
1065 ; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
1067 call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice,
1068 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
1069 <vscale x 8 x half> %zm, i32 0)
1070 %slice.6 = add i32 %slice, 6
1071 call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice.6,
1072 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
1073 <vscale x 8 x half> %zm, i32 7)
1077 define void @multi_vector_add_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
1078 ; CHECK-LABEL: multi_vector_add_lane_vg2x4_bf16:
1080 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1081 ; CHECK-NEXT: mov w8, w0
1082 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1083 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1084 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1085 ; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
1086 ; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
1088 call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice,
1089 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
1090 <vscale x 8 x bfloat> %zm, i32 0)
1091 %slice.6 = add i32 %slice, 6
1092 call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice.6,
1093 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
1094 <vscale x 8 x bfloat> %zm, i32 7)
1098 define void @multi_vector_add_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
1099 ; CHECK-LABEL: multi_vector_add_lane_vg2x4_s16:
1101 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1102 ; CHECK-NEXT: mov w8, w0
1103 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1104 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1105 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1106 ; CHECK-NEXT: smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
1107 ; CHECK-NEXT: smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
1109 call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice,
1110 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1111 <vscale x 8 x i16> %zm, i32 0)
1112 %slice.6 = add i32 %slice, 6
1113 call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice.6,
1114 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1115 <vscale x 8 x i16> %zm, i32 7)
1119 define void @multi_vector_add_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
1120 ; CHECK-LABEL: multi_vector_add_lane_vg2x4_u16:
1122 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1123 ; CHECK-NEXT: mov w8, w0
1124 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1125 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1126 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1127 ; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
1128 ; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
1130 call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice,
1131 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1132 <vscale x 8 x i16> %zm, i32 0)
1133 %slice.6 = add i32 %slice, 6
1134 call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice.6,
1135 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1136 <vscale x 8 x i16> %zm, i32 7)
1141 ; BF/F/S/UMLSL x4 (INDEXED)
1144 define void @multi_vector_sub_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
1145 ; CHECK-LABEL: multi_vector_sub_lane_vg2x4_f16:
1147 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1148 ; CHECK-NEXT: mov w8, w0
1149 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1150 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1151 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1152 ; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
1153 ; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
1155 call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice,
1156 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
1157 <vscale x 8 x half> %zm, i32 0)
1158 %slice.6 = add i32 %slice, 6
1159 call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice.6,
1160 <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
1161 <vscale x 8 x half> %zm, i32 7)
1165 define void @multi_vector_sub_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
1166 ; CHECK-LABEL: multi_vector_sub_lane_vg2x4_bf16:
1168 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1169 ; CHECK-NEXT: mov w8, w0
1170 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1171 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1172 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1173 ; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
1174 ; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
1176 call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice,
1177 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
1178 <vscale x 8 x bfloat> %zm, i32 0)
1179 %slice.6 = add i32 %slice, 6
1180 call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice.6,
1181 <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
1182 <vscale x 8 x bfloat> %zm, i32 7)
; Verifies that @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16 lowers to the
; indexed SMLSL (signed multiply-subtract long, 4-register group)
; instruction, mirroring the bf16 variant above: slice offset 0 with lane
; index 0, and slice offset 6 with lane index 7.
1186 define void @multi_vector_sub_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
1187 ; CHECK-LABEL: multi_vector_sub_lane_vg2x4_s16:
1189 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1190 ; CHECK-NEXT: mov w8, w0
1191 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1192 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1193 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1194 ; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
1195 ; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
1197 call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice,
1198 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1199 <vscale x 8 x i16> %zm, i32 0)
1200 %slice.6 = add i32 %slice, 6
1201 call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
1202 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1203 <vscale x 8 x i16> %zm, i32 7)
; Verifies that @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16 lowers to the
; indexed UMLSL (unsigned multiply-subtract long, 4-register group)
; instruction — the unsigned counterpart of the s16 test above, using the
; same slice offsets (0 and 6) and lane indices (0 and 7).
1207 define void @multi_vector_sub_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
1208 ; CHECK-LABEL: multi_vector_sub_lane_vg2x4_u16:
1210 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1211 ; CHECK-NEXT: mov w8, w0
1212 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1213 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1214 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
1215 ; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
1216 ; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
1218 call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice,
1219 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1220 <vscale x 8 x i16> %zm, i32 0)
1221 %slice.6 = add i32 %slice, 6
1222 call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
1223 <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1224 <vscale x 8 x i16> %zm, i32 7)
1228 declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1229 declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
1230 declare void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1231 declare void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1233 declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1234 declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
1235 declare void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1236 declare void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1238 declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1239 declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1240 declare void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1241 declare void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1243 declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1244 declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1245 declare void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1246 declare void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1248 declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
1249 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1250 declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>,
1251 <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1252 declare void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
1253 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1254 declare void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
1255 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1257 declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
1258 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1259 declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>,
1260 <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1261 declare void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
1262 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1263 declare void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
1264 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1266 declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1267 declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1268 declare void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1269 declare void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1271 declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1272 declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1273 declare void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1274 declare void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1276 declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
1277 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1278 declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
1279 <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1280 declare void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1281 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1282 declare void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1283 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1285 declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
1286 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1287 declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
1288 <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1289 declare void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1290 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1291 declare void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1292 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1294 declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1295 declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1296 declare void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1297 declare void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1299 declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1300 declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1301 declare void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1302 declare void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1304 declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1305 declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1306 declare void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1307 declare void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1309 declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1310 declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1311 declare void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1312 declare void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1314 declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1315 declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1316 declare void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1317 declare void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1319 declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1320 declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1321 declare void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1322 declare void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)