1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -verify-machineinstrs < %s | FileCheck %s
; Exercises llvm.aarch64.sme.sub.write.single.za.vg1x2 with nxv4i32 operands
; (two %zn vectors plus a single %zm). The intrinsic is called with %slice and
; again with %slice + 7; the assertions below expect the same base register
; (w8) with slice offsets 0 and 7, i.e. no separate add instruction.
8 define void @multi_vector_sub_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm) {
9 ; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x2_i32:
11 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
12 ; CHECK-NEXT: mov w8, w0
13 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
14 ; CHECK-NEXT: sub za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
15 ; CHECK-NEXT: sub za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
17 call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 %slice,
18 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
19 <vscale x 4 x i32> %zm)
20 %slice.7 = add i32 %slice, 7
21 call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 %slice.7,
22 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
23 <vscale x 4 x i32> %zm)
; Exercises llvm.aarch64.sme.sub.write.single.za.vg1x2 with nxv2i64 operands
; (two %zn vectors plus a single %zm). Called with %slice and with %slice + 7;
; the assertions expect za.d accesses on base w8 with slice offsets 0 and 7.
27 define void @multi_vector_sub_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zm) {
28 ; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x2_i64:
30 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
31 ; CHECK-NEXT: mov w8, w0
32 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
33 ; CHECK-NEXT: sub za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
34 ; CHECK-NEXT: sub za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
36 call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 %slice,
37 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
38 <vscale x 2 x i64> %zm)
39 %slice.7 = add i32 %slice, 7
40 call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 %slice.7,
41 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
42 <vscale x 2 x i64> %zm)
; Exercises llvm.aarch64.sme.sub.write.single.za.vg1x4 with nxv4i32 operands
; (four %zn vectors plus a single %zm). Called with %slice and with %slice + 7;
; the assertions expect the vgx4 form on { z0.s - z3.s } with z4.s, using base
; w8 and slice offsets 0 and 7.
50 define void @multi_vector_sub_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
51 ; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x4_i32:
53 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
54 ; CHECK-NEXT: mov w8, w0
55 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
56 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
57 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
58 ; CHECK-NEXT: sub za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
59 ; CHECK-NEXT: sub za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
61 <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
62 <vscale x 4 x i32> %zm) {
63 call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 %slice,
64 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
65 <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
66 <vscale x 4 x i32> %zm)
67 %slice.7 = add i32 %slice, 7
68 call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 %slice.7,
69 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
70 <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
71 <vscale x 4 x i32> %zm)
; Exercises llvm.aarch64.sme.sub.write.single.za.vg1x4 with nxv2i64 operands
; (four %zn vectors plus a single %zm). Called with %slice and with %slice + 7;
; the assertions expect the vgx4 form on { z0.d - z3.d } with z4.d, using base
; w8 and slice offsets 0 and 7.
75 define void @multi_vector_sub_write_single_za_vg1x4_i64(i32 %slice,
76 ; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x4_i64:
78 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
79 ; CHECK-NEXT: mov w8, w0
80 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
81 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
82 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
83 ; CHECK-NEXT: sub za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
84 ; CHECK-NEXT: sub za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
86 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
87 <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
88 <vscale x 2 x i64> %zm) {
89 call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 %slice,
90 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
91 <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
92 <vscale x 2 x i64> %zm)
93 %slice.7 = add i32 %slice, 7
94 call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 %slice.7,
95 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
96 <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
97 <vscale x 2 x i64> %zm)
; Exercises llvm.aarch64.sme.sub.write.za.vg1x2 with nxv4i32 operands: a pair
; of %zn vectors and a pair of %zm vectors. Called with %slice and with
; %slice + 7; the assertions expect { z0.s, z1.s } and { z2.s, z3.s } as the
; two operand groups, base w8, and slice offsets 0 and 7.
105 define void @multi_vector_sub_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
106 ; CHECK-LABEL: multi_vector_sub_write_za_vg1x2_i32:
108 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
109 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
110 ; CHECK-NEXT: mov w8, w0
111 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
112 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
113 ; CHECK-NEXT: sub za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
114 ; CHECK-NEXT: sub za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
116 <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) {
117 call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 %slice,
118 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
119 <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
120 %slice.7 = add i32 %slice, 7
121 call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 %slice.7,
122 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
123 <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
; Exercises llvm.aarch64.sme.sub.write.za.vg1x2 with nxv2i64 operands: a pair
; of %zn vectors and a pair of %zm vectors. Called with %slice and with
; %slice + 7; the assertions expect { z0.d, z1.d } and { z2.d, z3.d } as the
; two operand groups, base w8, and slice offsets 0 and 7.
128 define void @multi_vector_sub_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
129 ; CHECK-LABEL: multi_vector_sub_write_za_vg1x2_i64:
131 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
132 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
133 ; CHECK-NEXT: mov w8, w0
134 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
135 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
136 ; CHECK-NEXT: sub za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
137 ; CHECK-NEXT: sub za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
139 <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) {
140 call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 %slice,
141 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
142 <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
143 %slice.7 = add i32 %slice, 7
144 call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 %slice.7,
145 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
146 <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
; Exercises llvm.aarch64.sme.sub.write.za.vg1x4 with nxv4i32 operands: four
; %zn vectors and four %zm vectors. Called with %slice and with %slice + 7;
; the assertions expect { z0.s - z3.s } and { z4.s - z7.s } as the two operand
; groups, base w8, and slice offsets 0 and 7.
155 define void @multi_vector_sub_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
156 ; CHECK-LABEL: multi_vector_sub_write_za_vg1x4_i32:
158 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
159 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
160 ; CHECK-NEXT: mov w8, w0
161 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
162 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
163 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
164 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
165 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
166 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
167 ; CHECK-NEXT: sub za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
168 ; CHECK-NEXT: sub za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
170 <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
171 <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
172 <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3) {
173 call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 %slice,
174 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
175 <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
176 <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
177 <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
178 %slice.7 = add i32 %slice, 7
179 call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 %slice.7,
180 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
181 <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
182 <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
183 <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
; Exercises llvm.aarch64.sme.sub.write.za.vg1x4 with nxv2i64 operands: four
; %zn vectors and four %zm vectors. Called with %slice and with %slice + 7;
; the assertions expect { z0.d - z3.d } and { z4.d - z7.d } as the two operand
; groups, base w8, and slice offsets 0 and 7.
187 define void @multi_vector_sub_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
188 ; CHECK-LABEL: multi_vector_sub_write_za_vg1x4_i64:
190 ; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
191 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
192 ; CHECK-NEXT: mov w8, w0
193 ; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
194 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
195 ; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
196 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
197 ; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
198 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
199 ; CHECK-NEXT: sub za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
200 ; CHECK-NEXT: sub za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
202 <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
203 <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
204 <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3) {
205 call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 %slice,
206 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
207 <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
208 <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
209 <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
210 %slice.7 = add i32 %slice, 7
211 call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 %slice.7,
212 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
213 <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
214 <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
215 <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
221 ; SUB/FSUB of a multi-vector operand from ZA, accumulating the result into ZA
; Exercises llvm.aarch64.sme.sub.za32.vg1x2 with two nxv4i32 vectors and no
; %zm operand. Called with %slice and with %slice + 7; the assertions expect
; `sub za.s[...]` on { z0.s, z1.s } with base w8 and slice offsets 0 and 7.
225 define void @multi_vector_sub_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
226 ; CHECK-LABEL: multi_vector_sub_za_vg1x2_i32:
228 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
229 ; CHECK-NEXT: mov w8, w0
230 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
231 ; CHECK-NEXT: sub za.s[w8, 0, vgx2], { z0.s, z1.s }
232 ; CHECK-NEXT: sub za.s[w8, 7, vgx2], { z0.s, z1.s }
234 call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 %slice,<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
235 %slice.7 = add i32 %slice, 7
236 call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
; Exercises llvm.aarch64.sme.sub.za64.vg1x2 with two nxv2i64 vectors and no
; %zm operand. Called with %slice and with %slice + 7; the assertions expect
; `sub za.d[...]` on { z0.d, z1.d } with base w8 and slice offsets 0 and 7.
240 define void @multi_vector_sub_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
241 ; CHECK-LABEL: multi_vector_sub_za_vg1x2_i64:
243 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
244 ; CHECK-NEXT: mov w8, w0
245 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
246 ; CHECK-NEXT: sub za.d[w8, 0, vgx2], { z0.d, z1.d }
247 ; CHECK-NEXT: sub za.d[w8, 7, vgx2], { z0.d, z1.d }
249 call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
250 %slice.7 = add i32 %slice, 7
251 call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
; Exercises llvm.aarch64.sme.sub.za32.vg1x2 with two nxv4f32 vectors. For the
; floating-point element type the assertions expect `fsub za.s[...]` on
; { z0.s, z1.s }, with base w8 and slice offsets 0 and 7 for the %slice and
; %slice + 7 calls respectively.
255 define void @multi_vector_sub_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
256 ; CHECK-LABEL: multi_vector_sub_za_vg1x2_f32:
258 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
259 ; CHECK-NEXT: mov w8, w0
260 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
261 ; CHECK-NEXT: fsub za.s[w8, 0, vgx2], { z0.s, z1.s }
262 ; CHECK-NEXT: fsub za.s[w8, 7, vgx2], { z0.s, z1.s }
264 call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 %slice,
265 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
266 %slice.7 = add i32 %slice, 7
267 call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 %slice.7,
268 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
; Exercises llvm.aarch64.sme.sub.za64.vg1x2 with two nxv2f64 vectors. For the
; floating-point element type the assertions expect `fsub za.d[...]` on
; { z0.d, z1.d }, with base w8 and slice offsets 0 and 7 for the %slice and
; %slice + 7 calls respectively.
272 define void @multi_vector_sub_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
273 ; CHECK-LABEL: multi_vector_sub_za_vg1x2_f64:
275 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
276 ; CHECK-NEXT: mov w8, w0
277 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
278 ; CHECK-NEXT: fsub za.d[w8, 0, vgx2], { z0.d, z1.d }
279 ; CHECK-NEXT: fsub za.d[w8, 7, vgx2], { z0.d, z1.d }
281 call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 %slice,
282 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
283 %slice.7 = add i32 %slice, 7
284 call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 %slice.7,
285 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
; Exercises llvm.aarch64.sme.sub.za32.vg1x4 with four nxv4i32 vectors and no
; %zm operand. Called with %slice and with %slice + 7; the assertions expect
; `sub za.s[...]` on { z0.s - z3.s } with base w8 and slice offsets 0 and 7.
291 define void @multi_vector_sub_za_vg1x4_i32(i32 %slice,
292 ; CHECK-LABEL: multi_vector_sub_za_vg1x4_i32:
294 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
295 ; CHECK-NEXT: mov w8, w0
296 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
297 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
298 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
299 ; CHECK-NEXT: sub za.s[w8, 0, vgx4], { z0.s - z3.s }
300 ; CHECK-NEXT: sub za.s[w8, 7, vgx4], { z0.s - z3.s }
302 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
303 <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
304 call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 %slice,
305 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
306 <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
307 %slice.7 = add i32 %slice, 7
308 call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 %slice.7,
309 <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
310 <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
; Exercises llvm.aarch64.sme.sub.za64.vg1x4 with four nxv2i64 vectors and no
; %zm operand. Called with %slice and with %slice + 7; the assertions expect
; `sub za.d[...]` on { z0.d - z3.d } with base w8 and slice offsets 0 and 7.
314 define void @multi_vector_sub_za_vg1x4_i64(i32 %slice,
315 ; CHECK-LABEL: multi_vector_sub_za_vg1x4_i64:
317 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
318 ; CHECK-NEXT: mov w8, w0
319 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
320 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
321 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
322 ; CHECK-NEXT: sub za.d[w8, 0, vgx4], { z0.d - z3.d }
323 ; CHECK-NEXT: sub za.d[w8, 7, vgx4], { z0.d - z3.d }
325 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
326 <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) {
327 call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 %slice,
328 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
329 <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
330 %slice.7 = add i32 %slice, 7
331 call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 %slice.7,
332 <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
333 <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
; Exercises llvm.aarch64.sme.sub.za32.vg1x4 with four nxv4f32 vectors. For the
; floating-point element type the assertions expect `fsub za.s[...]` on
; { z0.s - z3.s }, with base w8 and slice offsets 0 and 7 for the %slice and
; %slice + 7 calls respectively.
337 define void @multi_vector_sub_za_vg1x4_f32(i32 %slice,
338 ; CHECK-LABEL: multi_vector_sub_za_vg1x4_f32:
340 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
341 ; CHECK-NEXT: mov w8, w0
342 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
343 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
344 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
345 ; CHECK-NEXT: fsub za.s[w8, 0, vgx4], { z0.s - z3.s }
346 ; CHECK-NEXT: fsub za.s[w8, 7, vgx4], { z0.s - z3.s }
348 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
349 <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
350 call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 %slice,
351 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
352 <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
353 %slice.7 = add i32 %slice, 7
354 call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 %slice.7,
355 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
356 <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
; Exercises llvm.aarch64.sme.sub.za64.vg1x4 with four nxv2f64 vectors. For the
; floating-point element type the assertions expect `fsub za.d[...]` on
; { z0.d - z3.d }, with base w8 and slice offsets 0 and 7 for the %slice and
; %slice + 7 calls respectively.
360 define void @multi_vector_sub_za_vg1x4_f64(i32 %slice,
361 ; CHECK-LABEL: multi_vector_sub_za_vg1x4_f64:
363 ; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
364 ; CHECK-NEXT: mov w8, w0
365 ; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
366 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
367 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
368 ; CHECK-NEXT: fsub za.d[w8, 0, vgx4], { z0.d - z3.d }
369 ; CHECK-NEXT: fsub za.d[w8, 7, vgx4], { z0.d - z3.d }
371 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
372 <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) {
373 call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 %slice,
374 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
375 <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
376 %slice.7 = add i32 %slice, 7
377 call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 %slice.7,
378 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
379 <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
; Declarations of the llvm.aarch64.sme.sub.* intrinsics called by the test
; functions above. Order: the write.single forms, then the write forms (each
; for vg1x2/vg1x4 and nxv4i32/nxv2i64), then the za32/za64 forms for integer
; and floating-point element types. Every intrinsic takes the i32 slice index
; first, followed by its vector operands, and returns void.
383 declare void@llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
384 declare void@llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
385 declare void@llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>,
386 <vscale x 4 x i32>, <vscale x 4 x i32>)
387 declare void@llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>,
388 <vscale x 2 x i64>, <vscale x 2 x i64>)
389 declare void@llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
390 declare void@llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
391 declare void@llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
392 declare void@llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
393 declare void@llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32, <vscale x 4 x i32>,<vscale x 4 x i32>)
394 declare void@llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32, <vscale x 2 x i64>,<vscale x 2 x i64>)
395 declare void@llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32, <vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>)
396 declare void@llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32, <vscale x 2 x i64>,<vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>)
397 declare void@llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
398 declare void@llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)
399 declare void@llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>)
400 declare void@llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>)