; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -force-streaming -verify-machineinstrs < %s | FileCheck %s
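
; Each test below invokes its intrinsic twice, once at the incoming %slice
; and once at %slice + 7, covering both ends of the 0-7 slice-offset
; immediate accepted by the vgx2/vgx4 ZA addressing form.
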
define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice.7,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zm)
  ret void
}

define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice.7,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zm)
  ret void
}

define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    ret
            <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
            <vscale x 4 x i32> %zm) {
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
            <vscale x 4 x i32> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice.7,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
            <vscale x 4 x i32> %zm)
  ret void
}

define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    ret
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
            <vscale x 2 x i64> %zm) {
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
            <vscale x 2 x i64> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice.7,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
            <vscale x 2 x i64> %zm)
  ret void
}

define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    ret
            <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) {
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice.7,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
  ret void
}

define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    ret
            <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) {
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice.7,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
  ret void
}

define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    ret
            <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
            <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
            <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3) {
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
            <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
            <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice.7,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
            <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
            <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
  ret void
}

define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    ret
            <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
            <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
            <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3) {
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
            <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
            <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice.7,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
            <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
            <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
  ret void
}

;
; ADD and accumulate into ZA
;
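
; The intrinsics in this section lower to the accumulate-only ADD/FADD forms,
; which take no separate Zm operand: the vector group is added directly to
; the addressed ZA slices. The 64-bit element variants (za64) depend on the
; +sme-i16i64 and +sme-f64f64 features from the RUN line.
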
define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fadd za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    fadd za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fadd za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    fadd za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice.7,
            <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
            <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
  ret void
}

define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice.7,
            <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
            <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
  ret void
}

define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fadd za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    fadd za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice.7,
            <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
            <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
  ret void
}

define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fadd za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    fadd za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice.7,
            <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
            <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
  ret void
}

;
; ADD Vectors Multi-Single x2
;
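
; These intrinsics return their result as a struct of scalable vectors. The
; leading %unused argument pushes %zdn1/%zdn2 out of z0/z1, so the checks
; also cover the copies into an even-aligned pair (z4/z5) that the
; destructive multi-vector ADD requires.
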
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x2_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.b, z5.b }, { z4.b, z5.b }, z3.b
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> }
            @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
            <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_single_x2_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.h, z5.h }, { z4.h, z5.h }, z3.h
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> }
            @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
            <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_single_x2_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.s, z5.s }, { z4.s, z5.s }, z3.s
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
            @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
            <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_single_x2_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.d, z5.d }, { z4.d, z5.d }, z3.d
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> }
            @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
            <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

;
; ADD Vectors Multi-Single x4
;
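
; As above, but with four-register groups: the consecutive x4 tuple must
; start at a register number that is a multiple of four, hence the copies
; into z24-z27 before the add and back into z0-z3 for the return.
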
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x4_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_add_single_x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.b - z27.b }, { z24.b - z27.b }, z5.b
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
            @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
            <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
            <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_x4_single_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_add_x4_single_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.h - z27.h }, { z24.h - z27.h }, z5.h
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
            @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
            <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
            <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_x4_single_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_add_x4_single_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.s - z27.s }, { z24.s - z27.s }, z5.s
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
            @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
            <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
            <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_x4_single_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_add_x4_single_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.d - z27.d }, { z24.d - z27.d }, z5.d
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
            @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
            <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
            <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)