; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-f64f64 -verify-machineinstrs | FileCheck %s
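
; These tests cover the SME2 multi-vector FMLA/FMLS intrinsics. Each test
; invokes the intrinsic twice, once with the incoming slice index and once
; with slice+7, to check that offsets in the range [0, 7] are folded into
; the instruction's slice immediate rather than computed with a separate add.
; The "// kill:" lines are liveness annotations left behind when the separate
; argument registers are re-tagged as parts of a consecutive register tuple;
; they emit no machine code.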
define void @multi_vector_add_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice.7,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_add_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice.7,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zm)
  ret void
}

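; The vg1x4 forms below operate on a group of four consecutive Z registers
; ({ z0 - z3 }), selected with the vgx4 qualifier.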
define void @multi_vector_add_single_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_single_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    ret
                                             <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                        <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice.7,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                        <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_add_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_add_single_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    ret
                                             <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                        <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice.7,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                        <vscale x 2 x double> %zm)
  ret void
}

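; FMLS variants: identical lowering to the FMLA tests above, except the
; product is subtracted from the selected ZA slices instead of added.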
define void @multi_vector_sub_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice.7,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_sub_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice.7,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zm)
  ret void
}

define void @multi_vector_sub_single_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_sub_single_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    ret
                                             <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                        <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice.7,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                        <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_sub_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_sub_single_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    ret
                                             <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                        <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice.7,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                        <vscale x 2 x double> %zm)
  ret void
}

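; Multi x multi forms: the second multiplicand is also a register group
; ({ z2, z3 } below) rather than a single replicated vector, so both operand
; tuples must land in suitably aligned register sequences.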
define void @multi_vector_add_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
; CHECK-LABEL: multi_vector_add_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    ret
                                      <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice.7,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  ret void
}

define void @multi_vector_add_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
; CHECK-LABEL: multi_vector_add_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    ret
                                      <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice.7,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  ret void
}

; Test to ensure the correct register class is used (first register in the list should be a multiple of 2)
define void @multi_vector_add_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
; CHECK-LABEL: multi_vector_add_vg1x2_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z4.d, z3.d
; CHECK-NEXT:    mov z6.d, z1.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z7.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z6.s, z7.s }, { z4.s, z5.s }
; CHECK-NEXT:    ret
                                               <vscale x 4 x float> %zm0, <vscale x 4 x float> %zm1) {
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn0,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm0)
  ret void
}

define void @multi_vector_add_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    ret
                                      <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice.7,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  ret void
}

define void @multi_vector_add_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_add_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    ret
                                      <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice.7,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  ret void
}

; Test to ensure the correct register class is used (first register in the list should be a multiple of 4)
define void @multi_vector_add_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_vg1x4_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z26.d, z7.d
; CHECK-NEXT:    mov z30.d, z3.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z25.d, z6.d
; CHECK-NEXT:    mov z29.d, z2.d
; CHECK-NEXT:    mov z24.d, z5.d
; CHECK-NEXT:    mov z28.d, z1.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z31.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z28.s - z31.s }, { z24.s - z27.s }
; CHECK-NEXT:    ret
                                               <vscale x 4 x float> %zm0, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3) {
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn0,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm0)
  ret void
}

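; FMLS multi x multi variants of the tests above.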
define void @multi_vector_sub_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
; CHECK-LABEL: multi_vector_sub_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    ret
                                      <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice.7,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  ret void
}

define void @multi_vector_sub_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
; CHECK-LABEL: multi_vector_sub_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    ret
                                      <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice.7,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  ret void
}

define void @multi_vector_sub_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_sub_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    ret
                                      <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice.7,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  ret void
}

define void @multi_vector_sub_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_sub_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    ret
                                      <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice.7,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  ret void
}

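; Indexed (lane) forms: the trailing i32 immediate of the intrinsic selects
; an element of the single multiplicand vector, e.g. z2.s[3]. The valid index
; range is 0-3 for .s elements and 0-1 for .d elements, and the tests below
; use the maximum index for each element size.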
define void @multi_vector_add_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice.7,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_add_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice.7,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zm, i32 1)
  ret void
}

; Test to ensure the correct register class is used (first register in the list should be a multiple of 2)
define void @multi_vector_add_lane_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg1x2_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z5.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z4.s, z5.s }, z2.s[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn0,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_add_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_lane_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    ret
                                           <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                      <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice.7,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_add_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_add_lane_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    ret
                                           <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                      <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice.7,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                      <vscale x 2 x double> %zm, i32 1)
  ret void
}

; Test to ensure the correct register class is used (first register in the list should be a multiple of 4)
define void @multi_vector_add_lane_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_lane_vg1x4_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    mov z27.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z24.s - z27.s }, z4.s[3]
; CHECK-NEXT:    ret
                                                    <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2,
                                                      <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn0,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

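; FMLS indexed (lane) variants of the tests above.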
define void @multi_vector_sub_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice.7,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_sub_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice.7,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zm, i32 1)
  ret void
}

define void @multi_vector_sub_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_sub_lane_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    ret
                                           <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                      <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice.7,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_sub_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_sub_lane_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    ret
                                           <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                      <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice.7,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                      <vscale x 2 x double> %zm, i32 1)
  ret void
}

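; Declarations of the intrinsics exercised above.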
declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
                                                  <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
                                                  <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
                                                  <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
                                                  <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)

declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)