; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s
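
;
; SMLALL (signed multiply-add long-long): widening i8->i32 / i16->i64 multiply-accumulate into groups of four ZA slices
;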
define void @multi_vector_mul_add_single_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlall za.s[w8, 0:3], z1.b, z2.b
; CHECK-NEXT: smlall za.s[w8, 12:15], z1.b, z2.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlall za.d[w8, 0:3], z1.h, z2.h
; CHECK-NEXT: smlall za.d[w8, 12:15], z1.h, z2.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlall za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT: smlall za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlall za.d[w8, 0:3], z1.h, z2.h[0]
; CHECK-NEXT: smlall za.d[w8, 12:15], z1.h, z2.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
; CHECK-NEXT: smlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smlall za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
; CHECK-NEXT: smlall za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
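
;
; UMLALL (unsigned multiply-add long-long)
;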
define void @multi_vector_mul_add_single_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlall za.s[w8, 0:3], z1.b, z2.b
; CHECK-NEXT: umlall za.s[w8, 12:15], z1.b, z2.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlall za.d[w8, 0:3], z1.h, z2.h
; CHECK-NEXT: umlall za.d[w8, 12:15], z1.h, z2.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_add_single_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}

define void @multi_vector_mul_add_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlall za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT: umlall za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: umlall za.d[w8, 0:3], z1.h, z2.h[0]
; CHECK-NEXT: umlall za.d[w8, 12:15], z1.h, z2.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
; CHECK-NEXT: umlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_add_lane_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: umlall za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
; CHECK-NEXT: umlall za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
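
;
; SMLSLL (signed multiply-subtract long-long)
;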
define void @multi_vector_mul_sub_single_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlsll za.s[w8, 0:3], z1.b, z2.b
; CHECK-NEXT: smlsll za.s[w8, 12:15], z1.b, z2.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlsll za.d[w8, 0:3], z1.h, z2.h
; CHECK-NEXT: smlsll za.d[w8, 12:15], z1.h, z2.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
  ret void
}

define void @multi_vector_mul_sub_single_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
  ret void
}

define void @multi_vector_mul_sub_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
  ret void
}

define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
  ret void
}

define void @multi_vector_mul_sub_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlsll za.s[w8, 0:3], z1.b, z2.b[0]
; CHECK-NEXT: smlsll za.s[w8, 12:15], z1.b, z2.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: smlsll za.d[w8, 0:3], z1.h, z2.h[0]
; CHECK-NEXT: smlsll za.d[w8, 12:15], z1.h, z2.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
  ret void
}

define void @multi_vector_mul_sub_lane_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
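
;
; UMLSLL (unsigned multiply-subtract long-long)
;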
838 define void @multi_vector_mul_sub_single_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
839 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_u8:
841 ; CHECK-NEXT: mov w8, w0
842 ; CHECK-NEXT: umlsll za.s[w8, 0:3], z1.b, z2.b
843 ; CHECK-NEXT: umlsll za.s[w8, 12:15], z1.b, z2.b
845 call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
846 %slice.12 = add i32 %slice, 12
847 call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
851 define void @multi_vector_mul_sub_single_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
852 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_u16:
854 ; CHECK-NEXT: mov w8, w0
855 ; CHECK-NEXT: umlsll za.d[w8, 0:3], z1.h, z2.h
856 ; CHECK-NEXT: umlsll za.d[w8, 12:15], z1.h, z2.h
858 call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
859 %slice.12 = add i32 %slice, 12
860 call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
866 define void @multi_vector_mul_sub_single_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
867 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_u8:
869 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
870 ; CHECK-NEXT: mov w8, w0
871 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
872 ; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
873 ; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
875 call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
876 %slice.4 = add i32 %slice, 4
877 call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
881 define void @multi_vector_mul_sub_single_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
882 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_u16:
884 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
885 ; CHECK-NEXT: mov w8, w0
886 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
887 ; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
888 ; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
890 call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
891 %slice.4 = add i32 %slice, 4
892 call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
898 define void @multi_vector_mul_sub_single_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
899 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_u8:
901 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
902 ; CHECK-NEXT: mov w8, w0
903 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
904 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
905 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
906 ; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
907 ; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
909 call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
910 %slice.4 = add i32 %slice, 4
911 call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
915 define void @multi_vector_mul_sub_single_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
916 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_u16:
918 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
919 ; CHECK-NEXT: mov w8, w0
920 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
921 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
922 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
923 ; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
924 ; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
926 call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
927 %slice.4 = add i32 %slice, 4
928 call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
934 define void @multi_vector_mul_sub_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
935 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u8:
937 ; CHECK-NEXT: mov z5.d, z4.d
938 ; CHECK-NEXT: mov z7.d, z2.d
939 ; CHECK-NEXT: mov w8, w0
940 ; CHECK-NEXT: mov z4.d, z3.d
941 ; CHECK-NEXT: mov z6.d, z1.d
942 ; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
943 ; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
945 call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
946 %slice.4 = add i32 %slice, 4
947 call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
951 define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
952 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u16:
954 ; CHECK-NEXT: mov z5.d, z4.d
955 ; CHECK-NEXT: mov z7.d, z2.d
956 ; CHECK-NEXT: mov w8, w0
957 ; CHECK-NEXT: mov z4.d, z3.d
958 ; CHECK-NEXT: mov z6.d, z1.d
959 ; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
960 ; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
962 call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
963 %slice.4 = add i32 %slice, 4
964 call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
970 define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
971 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u8:
973 ; CHECK-NEXT: mov z26.d, z7.d
974 ; CHECK-NEXT: mov z31.d, z4.d
975 ; CHECK-NEXT: mov w8, w0
976 ; CHECK-NEXT: ptrue p0.b
977 ; CHECK-NEXT: mov z25.d, z6.d
978 ; CHECK-NEXT: mov z30.d, z3.d
979 ; CHECK-NEXT: mov z24.d, z5.d
980 ; CHECK-NEXT: mov z29.d, z2.d
981 ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
982 ; CHECK-NEXT: mov z28.d, z1.d
983 ; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
984 ; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
986 call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
987 %slice.4 = add i32 %slice, 4
988 call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
992 define void @multi_vector_mul_sub_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
993 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u16:
995 ; CHECK-NEXT: mov z26.d, z7.d
996 ; CHECK-NEXT: mov z31.d, z4.d
997 ; CHECK-NEXT: mov w8, w0
998 ; CHECK-NEXT: ptrue p0.h
999 ; CHECK-NEXT: mov z25.d, z6.d
1000 ; CHECK-NEXT: mov z30.d, z3.d
1001 ; CHECK-NEXT: mov z24.d, z5.d
1002 ; CHECK-NEXT: mov z29.d, z2.d
1003 ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
1004 ; CHECK-NEXT: mov z28.d, z1.d
1005 ; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
1006 ; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
1008 call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
1009 %slice.4 = add i32 %slice, 4
1010 call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
1016 define void @multi_vector_mul_sub_lane_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
1017 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_u8:
1019 ; CHECK-NEXT: mov w8, w0
1020 ; CHECK-NEXT: umlsll za.s[w8, 0:3], z1.b, z2.b[0]
1021 ; CHECK-NEXT: umlsll za.s[w8, 12:15], z1.b, z2.b[15]
1023 call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
1024 %slice.12 = add i32 %slice, 12
1025 call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
1029 define void @multi_vector_mul_sub_lane_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
1030 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_u16:
1032 ; CHECK-NEXT: mov w8, w0
1033 ; CHECK-NEXT: umlsll za.d[w8, 0:3], z1.h, z2.h[0]
1034 ; CHECK-NEXT: umlsll za.d[w8, 12:15], z1.h, z2.h[7]
1036 call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
1037 %slice.12 = add i32 %slice, 12
1038 call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
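1039 ret void
1040 }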
1044 define void @multi_vector_mul_sub_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1045 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_u8:
1047 ; CHECK-NEXT: mov z5.d, z2.d
1048 ; CHECK-NEXT: mov w8, w0
1049 ; CHECK-NEXT: mov z4.d, z1.d
1050 ; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
1051 ; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
1053 call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
1054 %slice.4 = add i32 %slice, 4
1055 call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
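1056 ret void
1057 }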
1059 define void @multi_vector_mul_sub_lane_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
1060 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_u16:
1062 ; CHECK-NEXT: mov z5.d, z2.d
1063 ; CHECK-NEXT: mov w8, w0
1064 ; CHECK-NEXT: mov z4.d, z1.d
1065 ; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
1066 ; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
1068 call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
1069 %slice.4 = add i32 %slice, 4
1070 call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
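1071 ret void
1072 }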
1076 define void @multi_vector_mul_sub_lane_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1077 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_u8:
1079 ; CHECK-NEXT: mov z27.d, z4.d
1080 ; CHECK-NEXT: mov w8, w0
1081 ; CHECK-NEXT: mov z26.d, z3.d
1082 ; CHECK-NEXT: mov z25.d, z2.d
1083 ; CHECK-NEXT: mov z24.d, z1.d
1084 ; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
1085 ; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
1087 call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
1088 %slice.4 = add i32 %slice, 4
1089 call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
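1090 ret void
1091 }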
1093 define void @multi_vector_mul_sub_lane_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
1094 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_u16:
1096 ; CHECK-NEXT: mov z27.d, z4.d
1097 ; CHECK-NEXT: mov w8, w0
1098 ; CHECK-NEXT: mov z26.d, z3.d
1099 ; CHECK-NEXT: mov z25.d, z2.d
1100 ; CHECK-NEXT: mov z24.d, z1.d
1101 ; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
1102 ; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
1104 call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
1105 %slice.4 = add i32 %slice, 4
1106 call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
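1107 ret void
1108 }

; SUMLALL (single)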
1116 define void @multi_vector_mul_add_single_signed_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1117 ; CHECK-LABEL: multi_vector_mul_add_single_signed_long_vg4x2_s8:
1119 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
1120 ; CHECK-NEXT: mov w8, w0
1121 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
1122 ; CHECK-NEXT: sumlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
1123 ; CHECK-NEXT: sumlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
1125 call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
1126 %slice.4 = add i32 %slice, 4
1127 call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
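1128 ret void
1129 }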
1133 define void @multi_vector_mul_add_single_signed_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1134 ; CHECK-LABEL: multi_vector_mul_add_single_signed_long_vg4x4_s8:
1136 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1137 ; CHECK-NEXT: mov w8, w0
1138 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1139 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1140 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1141 ; CHECK-NEXT: sumlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
1142 ; CHECK-NEXT: sumlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
1144 call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
1145 %slice.4 = add i32 %slice, 4
1146 call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
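1147 ret void
1148 }

; SUMLALL (indexed)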
1152 define void @multi_vector_mul_add_lane_signed_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
1153 ; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x1_s8:
1155 ; CHECK-NEXT: mov w8, w0
1156 ; CHECK-NEXT: sumlall za.s[w8, 0:3], z1.b, z2.b[0]
1157 ; CHECK-NEXT: sumlall za.s[w8, 12:15], z1.b, z2.b[15]
1159 call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
1160 %slice.12 = add i32 %slice, 12
1161 call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
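1162 ret void
1163 }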
1167 define void @multi_vector_mul_add_lane_signed_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1168 ; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x2_s8:
1170 ; CHECK-NEXT: mov z5.d, z2.d
1171 ; CHECK-NEXT: mov w8, w0
1172 ; CHECK-NEXT: mov z4.d, z1.d
1173 ; CHECK-NEXT: sumlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
1174 ; CHECK-NEXT: sumlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
1176 call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
1177 %slice.4 = add i32 %slice, 4
1178 call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
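1179 ret void
1180 }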
1184 define void @multi_vector_mul_add_lane_signed_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1185 ; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x4_s8:
1187 ; CHECK-NEXT: mov z27.d, z4.d
1188 ; CHECK-NEXT: mov w8, w0
1189 ; CHECK-NEXT: mov z26.d, z3.d
1190 ; CHECK-NEXT: mov z25.d, z2.d
1191 ; CHECK-NEXT: mov z24.d, z1.d
1192 ; CHECK-NEXT: sumlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
1193 ; CHECK-NEXT: sumlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
1195 call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
1196 %slice.4 = add i32 %slice, 4
1197 call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
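1198 ret void
1199 }

; USMLALL (single)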
1205 define void @multi_vector_mul_add_single_unsigned_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
1206 ; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x1_s8:
1208 ; CHECK-NEXT: mov w8, w0
1209 ; CHECK-NEXT: usmlall za.s[w8, 0:3], z1.b, z2.b
1210 ; CHECK-NEXT: usmlall za.s[w8, 12:15], z1.b, z2.b
1212 call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
1213 %slice.12 = add i32 %slice, 12
1214 call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
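1215 ret void
1216 }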
1220 define void @multi_vector_mul_add_single_unsigned_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1221 ; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x2_s8:
1223 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
1224 ; CHECK-NEXT: mov w8, w0
1225 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
1226 ; CHECK-NEXT: usmlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
1227 ; CHECK-NEXT: usmlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
1229 call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
1230 %slice.4 = add i32 %slice, 4
1231 call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
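1232 ret void
1233 }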
1237 define void @multi_vector_mul_add_single_unsigned_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1238 ; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x4_s8:
1240 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1241 ; CHECK-NEXT: mov w8, w0
1242 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1243 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1244 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1245 ; CHECK-NEXT: usmlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
1246 ; CHECK-NEXT: usmlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
1248 call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
1249 %slice.4 = add i32 %slice, 4
1250 call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
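1251 ret void
1252 }

; USMLALL (multi)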
1256 define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
1257 ; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x2_u8:
1259 ; CHECK-NEXT: mov z5.d, z4.d
1260 ; CHECK-NEXT: mov z7.d, z2.d
1261 ; CHECK-NEXT: mov w8, w0
1262 ; CHECK-NEXT: mov z4.d, z3.d
1263 ; CHECK-NEXT: mov z6.d, z1.d
1264 ; CHECK-NEXT: usmlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
1265 ; CHECK-NEXT: usmlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
1267 call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
1268 %slice.4 = add i32 %slice, 4
1269 call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
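1270 ret void
1271 }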
1275 define void @multi_vector_mul_add_multi_unsigned_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
1276 ; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x4_u8:
1278 ; CHECK-NEXT: mov z26.d, z7.d
1279 ; CHECK-NEXT: mov z31.d, z4.d
1280 ; CHECK-NEXT: mov w8, w0
1281 ; CHECK-NEXT: ptrue p0.b
1282 ; CHECK-NEXT: mov z25.d, z6.d
1283 ; CHECK-NEXT: mov z30.d, z3.d
1284 ; CHECK-NEXT: mov z24.d, z5.d
1285 ; CHECK-NEXT: mov z29.d, z2.d
1286 ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
1287 ; CHECK-NEXT: mov z28.d, z1.d
1288 ; CHECK-NEXT: usmlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
1289 ; CHECK-NEXT: usmlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
1291 call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
1292 %slice.4 = add i32 %slice, 4
1293 call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
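1294 ret void
1295 }

; USMLALL (indexed)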
1299 define void @multi_vector_mul_add_lane_unsigned_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
1300 ; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x1_s8:
1302 ; CHECK-NEXT: mov w8, w0
1303 ; CHECK-NEXT: usmlall za.s[w8, 0:3], z1.b, z2.b[0]
1304 ; CHECK-NEXT: usmlall za.s[w8, 12:15], z1.b, z2.b[15]
1306 call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
1307 %slice.12 = add i32 %slice, 12
1308 call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
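1309 ret void
1310 }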
1314 define void @multi_vector_mul_add_lane_unsigned_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1315 ; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x2_s8:
1317 ; CHECK-NEXT: mov z5.d, z2.d
1318 ; CHECK-NEXT: mov w8, w0
1319 ; CHECK-NEXT: mov z4.d, z1.d
1320 ; CHECK-NEXT: usmlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
1321 ; CHECK-NEXT: usmlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
1323 call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
1324 %slice.4 = add i32 %slice, 4
1325 call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
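1326 ret void
1327 }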
1331 define void @multi_vector_mul_add_lane_unsigned_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1332 ; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x4_s8:
1334 ; CHECK-NEXT: mov z27.d, z4.d
1335 ; CHECK-NEXT: mov w8, w0
1336 ; CHECK-NEXT: mov z26.d, z3.d
1337 ; CHECK-NEXT: mov z25.d, z2.d
1338 ; CHECK-NEXT: mov z24.d, z1.d
1339 ; CHECK-NEXT: usmlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
1340 ; CHECK-NEXT: usmlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
1342 call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
1343 %slice.4 = add i32 %slice, 4
1344 call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
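1345 ret void
1346 }

; Intrinsic declarations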
1348 declare void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1349 declare void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1350 declare void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1352 declare void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1353 declare void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1354 declare void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1356 declare void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1357 declare void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1359 declare void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1360 declare void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1362 declare void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1363 declare void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1364 declare void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1366 declare void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1367 declare void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1368 declare void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1370 declare void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1371 declare void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1372 declare void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1374 declare void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1375 declare void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1376 declare void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1378 declare void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1379 declare void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1381 declare void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1382 declare void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1384 declare void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1385 declare void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1386 declare void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1388 declare void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1389 declare void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1390 declare void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1392 declare void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1393 declare void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1394 declare void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1396 declare void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1397 declare void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1398 declare void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1400 declare void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1401 declare void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1403 declare void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1404 declare void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1406 declare void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1407 declare void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1408 declare void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1410 declare void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1411 declare void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1412 declare void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1414 declare void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1415 declare void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1416 declare void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1418 declare void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1419 declare void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1420 declare void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1422 declare void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1423 declare void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1425 declare void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1426 declare void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1428 declare void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1429 declare void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1430 declare void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1432 declare void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1433 declare void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1434 declare void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1436 declare void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1437 declare void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1439 declare void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1440 declare void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1441 declare void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1443 declare void @llvm.aarch64.sme.sumla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1444 declare void @llvm.aarch64.sme.sumla.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1445 declare void @llvm.aarch64.sme.sumla.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1447 declare void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1448 declare void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1449 declare void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1451 declare void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1452 declare void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1454 declare void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1455 declare void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1456 declare void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1458 declare void @llvm.aarch64.sme.usmla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1459 declare void @llvm.aarch64.sme.usmla.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1460 declare void @llvm.aarch64.sme.usmla.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)