1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
4 target triple="aarch64-linux-gnu"
7 ; == Multi, multi (unsigned) ==
9 define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 {
10 ; CHECK-LABEL: udot_multi_za32_u16_vg1x2:
12 ; CHECK-NEXT: mov z5.d, z4.d
13 ; CHECK-NEXT: mov z7.d, z2.d
14 ; CHECK-NEXT: mov w8, w0
15 ; CHECK-NEXT: mov z4.d, z3.d
16 ; CHECK-NEXT: mov z6.d, z1.d
17 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
18 ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
20 call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
21 %slice2 = add i32 %slice, 7
22 call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
26 define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
27 ; CHECK-LABEL: udot_multi_za32_u16_vg1x4:
29 ; CHECK-NEXT: ptrue p0.h
30 ; CHECK-NEXT: mov z26.d, z7.d
31 ; CHECK-NEXT: mov z25.d, z6.d
32 ; CHECK-NEXT: mov z7.d, z4.d
33 ; CHECK-NEXT: mov w8, w0
34 ; CHECK-NEXT: mov z24.d, z5.d
35 ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
36 ; CHECK-NEXT: mov z6.d, z3.d
37 ; CHECK-NEXT: mov z5.d, z2.d
38 ; CHECK-NEXT: mov z4.d, z1.d
39 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
40 ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
42 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
43 call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
44 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
45 %slice2 = add i32 %slice, 7
46 call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
47 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
51 define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
52 ; CHECK-LABEL: udot_multi_za32_u8_vg1x2:
54 ; CHECK-NEXT: mov z5.d, z4.d
55 ; CHECK-NEXT: mov z7.d, z2.d
56 ; CHECK-NEXT: mov w8, w0
57 ; CHECK-NEXT: mov z4.d, z3.d
58 ; CHECK-NEXT: mov z6.d, z1.d
59 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
60 ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
62 call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
63 %slice2 = add i32 %slice, 7
64 call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
68 define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
69 ; CHECK-LABEL: udot_multi_za32_u8_vg1x4:
71 ; CHECK-NEXT: ptrue p0.b
72 ; CHECK-NEXT: mov z26.d, z7.d
73 ; CHECK-NEXT: mov z25.d, z6.d
74 ; CHECK-NEXT: mov z7.d, z4.d
75 ; CHECK-NEXT: mov w8, w0
76 ; CHECK-NEXT: mov z24.d, z5.d
77 ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
78 ; CHECK-NEXT: mov z6.d, z3.d
79 ; CHECK-NEXT: mov z5.d, z2.d
80 ; CHECK-NEXT: mov z4.d, z1.d
81 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
82 ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
84 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
85 call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
86 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
87 %slice2 = add i32 %slice, 7
88 call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
89 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
93 define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 {
94 ; CHECK-LABEL: udot_multi_za64_u16_vg1x2:
96 ; CHECK-NEXT: mov z5.d, z4.d
97 ; CHECK-NEXT: mov z7.d, z2.d
98 ; CHECK-NEXT: mov w8, w0
99 ; CHECK-NEXT: mov z4.d, z3.d
100 ; CHECK-NEXT: mov z6.d, z1.d
101 ; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
102 ; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
104 call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
105 %slice2 = add i32 %slice, 7
106 call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
110 define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
111 ; CHECK-LABEL: udot_multi_za64_u16_vg1x4:
113 ; CHECK-NEXT: ptrue p0.h
114 ; CHECK-NEXT: mov z26.d, z7.d
115 ; CHECK-NEXT: mov z25.d, z6.d
116 ; CHECK-NEXT: mov z7.d, z4.d
117 ; CHECK-NEXT: mov w8, w0
118 ; CHECK-NEXT: mov z24.d, z5.d
119 ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
120 ; CHECK-NEXT: mov z6.d, z3.d
121 ; CHECK-NEXT: mov z5.d, z2.d
122 ; CHECK-NEXT: mov z4.d, z1.d
123 ; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
124 ; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
126 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
127 call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
128 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
129 %slice2 = add i32 %slice, 7
130 call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
131 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
135 define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
136 ; CHECK-LABEL: usdot_multi_za32_u8_vg1x2:
138 ; CHECK-NEXT: mov z5.d, z4.d
139 ; CHECK-NEXT: mov z7.d, z2.d
140 ; CHECK-NEXT: mov w8, w0
141 ; CHECK-NEXT: mov z4.d, z3.d
142 ; CHECK-NEXT: mov z6.d, z1.d
143 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
144 ; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
146 call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
147 %slice2 = add i32 %slice, 7
148 call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
152 define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
153 ; CHECK-LABEL: usdot_multi_za32_u8_vg1x4:
155 ; CHECK-NEXT: ptrue p0.b
156 ; CHECK-NEXT: mov z26.d, z7.d
157 ; CHECK-NEXT: mov z25.d, z6.d
158 ; CHECK-NEXT: mov z7.d, z4.d
159 ; CHECK-NEXT: mov w8, w0
160 ; CHECK-NEXT: mov z24.d, z5.d
161 ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
162 ; CHECK-NEXT: mov z6.d, z3.d
163 ; CHECK-NEXT: mov z5.d, z2.d
164 ; CHECK-NEXT: mov z4.d, z1.d
165 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
166 ; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
168 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
169 call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
170 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
171 %slice2 = add i32 %slice, 7
172 call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
173 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
178 ; == Multi, multi (signed) ==
180 define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 {
181 ; CHECK-LABEL: sdot_multi_za32_u16_vg1x2:
183 ; CHECK-NEXT: mov z5.d, z4.d
184 ; CHECK-NEXT: mov z7.d, z2.d
185 ; CHECK-NEXT: mov w8, w0
186 ; CHECK-NEXT: mov z4.d, z3.d
187 ; CHECK-NEXT: mov z6.d, z1.d
188 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
189 ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
191 call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
192 %slice2 = add i32 %slice, 7
193 call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
197 define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
198 ; CHECK-LABEL: sdot_multi_za32_u16_vg1x4:
200 ; CHECK-NEXT: ptrue p0.h
201 ; CHECK-NEXT: mov z26.d, z7.d
202 ; CHECK-NEXT: mov z25.d, z6.d
203 ; CHECK-NEXT: mov z7.d, z4.d
204 ; CHECK-NEXT: mov w8, w0
205 ; CHECK-NEXT: mov z24.d, z5.d
206 ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
207 ; CHECK-NEXT: mov z6.d, z3.d
208 ; CHECK-NEXT: mov z5.d, z2.d
209 ; CHECK-NEXT: mov z4.d, z1.d
210 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
211 ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
213 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
214 call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
215 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
216 %slice2 = add i32 %slice, 7
217 call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
218 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
222 define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
223 ; CHECK-LABEL: sdot_multi_za32_u8_vg1x2:
225 ; CHECK-NEXT: mov z5.d, z4.d
226 ; CHECK-NEXT: mov z7.d, z2.d
227 ; CHECK-NEXT: mov w8, w0
228 ; CHECK-NEXT: mov z4.d, z3.d
229 ; CHECK-NEXT: mov z6.d, z1.d
230 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
231 ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
233 call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
234 %slice2 = add i32 %slice, 7
235 call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
239 define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
240 ; CHECK-LABEL: sdot_multi_za32_u8_vg1x4:
242 ; CHECK-NEXT: ptrue p0.b
243 ; CHECK-NEXT: mov z26.d, z7.d
244 ; CHECK-NEXT: mov z25.d, z6.d
245 ; CHECK-NEXT: mov z7.d, z4.d
246 ; CHECK-NEXT: mov w8, w0
247 ; CHECK-NEXT: mov z24.d, z5.d
248 ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
249 ; CHECK-NEXT: mov z6.d, z3.d
250 ; CHECK-NEXT: mov z5.d, z2.d
251 ; CHECK-NEXT: mov z4.d, z1.d
252 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
253 ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
255 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
256 call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
257 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
258 %slice2 = add i32 %slice, 7
259 call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
260 <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
264 define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 {
265 ; CHECK-LABEL: sdot_multi_za64_u16_vg1x2:
267 ; CHECK-NEXT: mov z5.d, z4.d
268 ; CHECK-NEXT: mov z7.d, z2.d
269 ; CHECK-NEXT: mov w8, w0
270 ; CHECK-NEXT: mov z4.d, z3.d
271 ; CHECK-NEXT: mov z6.d, z1.d
272 ; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
273 ; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
275 call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
276 %slice2 = add i32 %slice, 7
277 call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
281 define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
282 ; CHECK-LABEL: sdot_multi_za64_u16_vg1x4:
284 ; CHECK-NEXT: ptrue p0.h
285 ; CHECK-NEXT: mov z26.d, z7.d
286 ; CHECK-NEXT: mov z25.d, z6.d
287 ; CHECK-NEXT: mov z7.d, z4.d
288 ; CHECK-NEXT: mov w8, w0
289 ; CHECK-NEXT: mov z24.d, z5.d
290 ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
291 ; CHECK-NEXT: mov z6.d, z3.d
292 ; CHECK-NEXT: mov z5.d, z2.d
293 ; CHECK-NEXT: mov z4.d, z1.d
294 ; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
295 ; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
297 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
298 call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
299 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
300 %slice2 = add i32 %slice, 7
301 call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
302 <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
307 ; == Multi, single (unsigned) ==
309 define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
310 ; CHECK-LABEL: udot_single_za32_u16_vg1x2:
312 ; CHECK-NEXT: mov w8, w0
313 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
314 ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
316 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
317 %slice2 = add i32 %slice, 7
318 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
322 define void @udot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 {
323 ; CHECK-LABEL: udot_single_za32_u16_vg1x2_tuple:
324 ; CHECK: // %bb.0: // %entry
325 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
326 ; CHECK-NEXT: addvl sp, sp, #-3
327 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
328 ; CHECK-NEXT: ptrue pn8.b
329 ; CHECK-NEXT: add x9, x0, x1
330 ; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
331 ; CHECK-NEXT: mov w8, wzr
332 ; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
333 ; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x0]
334 ; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x9]
335 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z0.h
336 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z9.h, z10.h }, z0.h
337 ; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
338 ; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
339 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
340 ; CHECK-NEXT: addvl sp, sp, #3
341 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
344 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
345 %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
346 %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
347 %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
348 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
349 %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
350 %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
351 %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
352 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> %zn)
353 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> %zn)
357 define void @udot_single_za32_u16_vg1x2_x4load_x2tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 {
358 ; CHECK-LABEL: udot_single_za32_u16_vg1x2_x4load_x2tuple:
359 ; CHECK: // %bb.0: // %entry
360 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
361 ; CHECK-NEXT: addvl sp, sp, #-5
362 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
363 ; CHECK-NEXT: ptrue pn8.b
364 ; CHECK-NEXT: add x9, x0, x1
365 ; CHECK-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill
366 ; CHECK-NEXT: mov w8, wzr
367 ; CHECK-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill
368 ; CHECK-NEXT: str z10, [sp, #3, mul vl] // 16-byte Folded Spill
369 ; CHECK-NEXT: str z9, [sp, #4, mul vl] // 16-byte Folded Spill
370 ; CHECK-NEXT: ld1h { z1.h, z5.h, z9.h, z13.h }, pn8/z, [x0]
371 ; CHECK-NEXT: ld1h { z2.h, z6.h, z10.h, z14.h }, pn8/z, [x9]
372 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z0.h
373 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z5.h, z6.h }, z0.h
374 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z9.h, z10.h }, z0.h
375 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z13.h, z14.h }, z0.h
376 ; CHECK-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload
377 ; CHECK-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload
378 ; CHECK-NEXT: ldr z10, [sp, #3, mul vl] // 16-byte Folded Reload
379 ; CHECK-NEXT: ldr z9, [sp, #4, mul vl] // 16-byte Folded Reload
380 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
381 ; CHECK-NEXT: addvl sp, sp, #5
382 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
385 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
386 %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
387 %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
388 %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
389 %4 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 2
390 %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 3
391 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
392 %6 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
393 %7 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 0
394 %8 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 1
395 %9 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 2
396 %10 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 3
397 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %7, <vscale x 8 x i16> %zn)
398 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %8, <vscale x 8 x i16> %zn)
399 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %4, <vscale x 8 x i16> %9, <vscale x 8 x i16> %zn)
400 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %5, <vscale x 8 x i16> %10, <vscale x 8 x i16> %zn)
404 define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
405 ; CHECK-LABEL: udot_single_za32_u16_vg1x4:
407 ; CHECK-NEXT: mov w8, w0
408 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
409 ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
411 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
412 %slice2 = add i32 %slice, 7
413 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
417 define void @udot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 {
418 ; CHECK-LABEL: udot_single_za32_u16_vg1x4_tuple:
419 ; CHECK: // %bb.0: // %entry
420 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
421 ; CHECK-NEXT: addvl sp, sp, #-9
422 ; CHECK-NEXT: add x9, x1, x1, lsl #1
423 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
424 ; CHECK-NEXT: ptrue pn8.b
425 ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
426 ; CHECK-NEXT: add x10, x0, x1
427 ; CHECK-NEXT: mov w8, wzr
428 ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
429 ; CHECK-NEXT: add x9, x0, x9
430 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
431 ; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
432 ; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
433 ; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
434 ; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
435 ; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
436 ; CHECK-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x0]
437 ; CHECK-NEXT: ld1h { z17.h, z21.h, z25.h, z29.h }, pn8/z, [x10]
438 ; CHECK-NEXT: ld1h { z18.h, z22.h, z26.h, z30.h }, pn8/z, [x0, x1, lsl #1]
439 ; CHECK-NEXT: ld1h { z19.h, z23.h, z27.h, z31.h }, pn8/z, [x9]
440 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.h - z19.h }, z0.h
441 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.h - z23.h }, z0.h
442 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.h - z27.h }, z0.h
443 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, z0.h
444 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
445 ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
446 ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
447 ; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
448 ; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
449 ; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
450 ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
451 ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
452 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
453 ; CHECK-NEXT: addvl sp, sp, #9
454 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
457 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
458 %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
459 %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
460 %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
461 %4 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 2
462 %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 3
463 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
464 %6 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
465 %7 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 0
466 %8 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 1
467 %9 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 2
468 %10 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 3
469 %mul3 = shl i64 %stride, 1
470 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
471 %11 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx4)
472 %12 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 0
473 %13 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 1
474 %14 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 2
475 %15 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 3
476 %mul5 = mul i64 %stride, 3
477 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
478 %16 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx6)
479 %17 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 0
480 %18 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 1
481 %19 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 2
482 %20 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 3
483 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %7, <vscale x 8 x i16> %12, <vscale x 8 x i16> %17, <vscale x 8 x i16> %zn)
484 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %8, <vscale x 8 x i16> %13, <vscale x 8 x i16> %18, <vscale x 8 x i16> %zn)
485 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %4, <vscale x 8 x i16> %9, <vscale x 8 x i16> %14, <vscale x 8 x i16> %19, <vscale x 8 x i16> %zn)
486 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %5, <vscale x 8 x i16> %10, <vscale x 8 x i16> %15, <vscale x 8 x i16> %20, <vscale x 8 x i16> %zn)
490 define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
491 ; CHECK-LABEL: udot_single_za32_u8_vg1x2:
493 ; CHECK-NEXT: mov w8, w0
494 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
495 ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
497 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
498 %slice2 = add i32 %slice, 7
499 call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
503 define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
504 ; CHECK-LABEL: udot_single_za32_u8_vg1x4:
506 ; CHECK-NEXT: mov w8, w0
507 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
508 ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
510 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
511 %slice2 = add i32 %slice, 7
512 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
516 define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
517 ; CHECK-LABEL: udot_single_za64_u16_vg1x2:
519 ; CHECK-NEXT: mov w8, w0
520 ; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
521 ; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
523 call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
524 %slice2 = add i32 %slice, 7
525 call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
529 define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
530 ; CHECK-LABEL: udot_single_za64_u16_vg1x4:
532 ; CHECK-NEXT: mov w8, w0
533 ; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
534 ; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
536 call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
537 %slice2 = add i32 %slice, 7
538 call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
542 define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
543 ; CHECK-LABEL: usdot_single_za32_u8_vg1x2:
545 ; CHECK-NEXT: mov w8, w0
546 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
547 ; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
549 call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
550 %slice2 = add i32 %slice, 7
551 call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
555 define void @usdot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 {
556 ; CHECK-LABEL: usdot_single_za32_u16_vg1x2_tuple:
557 ; CHECK: // %bb.0: // %entry
558 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
559 ; CHECK-NEXT: addvl sp, sp, #-3
560 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
561 ; CHECK-NEXT: ptrue pn8.b
562 ; CHECK-NEXT: mov w8, wzr
563 ; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
564 ; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
565 ; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0]
566 ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1]
567 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z0.b
568 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z9.b, z10.b }, z0.b
569 ; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
570 ; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
571 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
572 ; CHECK-NEXT: addvl sp, sp, #3
573 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
576 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
577 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
578 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
579 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
580 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
581 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
582 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
583 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
584 call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %zn)
585 call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> %zn)
589 define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
590 ; CHECK-LABEL: usdot_single_za32_u8_vg1x4:
592 ; CHECK-NEXT: mov w8, w0
593 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
594 ; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
596 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
597 %slice2 = add i32 %slice, 7
598 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
602 define void @usdot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 {
603 ; CHECK-LABEL: usdot_single_za32_u16_vg1x4_tuple:
604 ; CHECK: // %bb.0: // %entry
605 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
606 ; CHECK-NEXT: addvl sp, sp, #-9
607 ; CHECK-NEXT: lsl x9, x1, #1
608 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
609 ; CHECK-NEXT: ptrue pn8.b
610 ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
611 ; CHECK-NEXT: mov w8, wzr
612 ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
613 ; CHECK-NEXT: add x10, x9, x1
614 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
615 ; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
616 ; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
617 ; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
618 ; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
619 ; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
620 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
621 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
622 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
623 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
624 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b
625 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b
626 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b
627 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b
628 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
629 ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
630 ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
631 ; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
632 ; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
633 ; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
634 ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
635 ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
636 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
637 ; CHECK-NEXT: addvl sp, sp, #9
638 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
641 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
642 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
643 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
644 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
645 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
646 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
647 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
648 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
649 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
650 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
651 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
652 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
653 %mul3 = shl i64 %stride, 1
654 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
655 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
656 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
657 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
658 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
659 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
660 %mul5 = mul i64 %stride, 3
661 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
662 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
663 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
664 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
665 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
666 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
667 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> %zn)
668 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> %zn)
669 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> %zn)
670 call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> %zn)
674 ; == Multi, single (signed) ==
676 define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
677 ; CHECK-LABEL: sdot_single_za32_u16_vg1x2:
679 ; CHECK-NEXT: mov w8, w0
680 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
681 ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
683 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
684 %slice2 = add i32 %slice, 7
685 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
689 define void @sdot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 {
690 ; CHECK-LABEL: sdot_single_za32_u16_vg1x2_tuple:
691 ; CHECK: // %bb.0: // %entry
692 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
693 ; CHECK-NEXT: addvl sp, sp, #-3
694 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
695 ; CHECK-NEXT: ptrue pn8.b
696 ; CHECK-NEXT: add x9, x0, x1
697 ; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
698 ; CHECK-NEXT: mov w8, wzr
699 ; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
700 ; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x0]
701 ; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x9]
702 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z0.h
703 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z9.h, z10.h }, z0.h
704 ; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
705 ; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
706 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
707 ; CHECK-NEXT: addvl sp, sp, #3
708 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
711 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
712 %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
713 %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
714 %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
715 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
716 %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
717 %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
718 %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
719 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> %zn)
720 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> %zn)
724 define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
725 ; CHECK-LABEL: sdot_single_za32_u16_vg1x4:
727 ; CHECK-NEXT: mov w8, w0
728 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
729 ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
731 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
732 %slice2 = add i32 %slice, 7
733 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
737 define void @sdot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 {
738 ; CHECK-LABEL: sdot_single_za32_u16_vg1x4_tuple:
739 ; CHECK: // %bb.0: // %entry
740 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
741 ; CHECK-NEXT: addvl sp, sp, #-9
742 ; CHECK-NEXT: add x9, x1, x1, lsl #1
743 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
744 ; CHECK-NEXT: ptrue pn8.b
745 ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
746 ; CHECK-NEXT: add x10, x0, x1
747 ; CHECK-NEXT: mov w8, wzr
748 ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
749 ; CHECK-NEXT: add x9, x0, x9
750 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
751 ; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
752 ; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
753 ; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
754 ; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
755 ; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
756 ; CHECK-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x0]
757 ; CHECK-NEXT: ld1h { z17.h, z21.h, z25.h, z29.h }, pn8/z, [x10]
758 ; CHECK-NEXT: ld1h { z18.h, z22.h, z26.h, z30.h }, pn8/z, [x0, x1, lsl #1]
759 ; CHECK-NEXT: ld1h { z19.h, z23.h, z27.h, z31.h }, pn8/z, [x9]
760 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.h - z19.h }, z0.h
761 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.h - z23.h }, z0.h
762 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z0.h
763 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, z0.h
764 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
765 ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
766 ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
767 ; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
768 ; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
769 ; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
770 ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
771 ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
772 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
773 ; CHECK-NEXT: addvl sp, sp, #9
774 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
777 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
778 %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
779 %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
780 %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
781 %4 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 2
782 %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 3
783 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
784 %6 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
785 %7 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 0
786 %8 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 1
787 %9 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 2
788 %10 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 3
789 %mul3 = shl i64 %stride, 1
790 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
791 %11 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx4)
792 %12 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 0
793 %13 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 1
794 %14 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 2
795 %15 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 3
796 %mul5 = mul i64 %stride, 3
797 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
798 %16 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx6)
799 %17 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 0
800 %18 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 1
801 %19 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 2
802 %20 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 3
803 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %7, <vscale x 8 x i16> %12, <vscale x 8 x i16> %17, <vscale x 8 x i16> %zn)
804 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %8, <vscale x 8 x i16> %13, <vscale x 8 x i16> %18, <vscale x 8 x i16> %zn)
805 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %4, <vscale x 8 x i16> %9, <vscale x 8 x i16> %14, <vscale x 8 x i16> %19, <vscale x 8 x i16> %zn)
806 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %5, <vscale x 8 x i16> %10, <vscale x 8 x i16> %15, <vscale x 8 x i16> %20, <vscale x 8 x i16> %zn)
810 define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
811 ; CHECK-LABEL: sdot_single_za32_u8_vg1x2:
813 ; CHECK-NEXT: mov w8, w0
814 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
815 ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
817 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
818 %slice2 = add i32 %slice, 7
819 call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
823 define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
824 ; CHECK-LABEL: sdot_single_za32_u8_vg1x4:
826 ; CHECK-NEXT: mov w8, w0
827 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
828 ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
830 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
831 %slice2 = add i32 %slice, 7
832 call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
836 define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
837 ; CHECK-LABEL: sdot_single_za64_u16_vg1x2:
839 ; CHECK-NEXT: mov w8, w0
840 ; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
841 ; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
843 call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
844 %slice2 = add i32 %slice, 7
845 call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
849 define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
850 ; CHECK-LABEL: sdot_single_za64_u16_vg1x4:
852 ; CHECK-NEXT: mov w8, w0
853 ; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
854 ; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
856 call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
857 %slice2 = add i32 %slice, 7
858 call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
862 define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
863 ; CHECK-LABEL: sudot_single_za32_u8_vg1x2:
865 ; CHECK-NEXT: mov w8, w0
866 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
867 ; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
869 call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
870 %slice2 = add i32 %slice, 7
871 call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
875 define void @sudot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 {
876 ; CHECK-LABEL: sudot_single_za32_u16_vg1x2_tuple:
877 ; CHECK: // %bb.0: // %entry
878 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
879 ; CHECK-NEXT: addvl sp, sp, #-3
880 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
881 ; CHECK-NEXT: ptrue pn8.b
882 ; CHECK-NEXT: mov w8, wzr
883 ; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
884 ; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
885 ; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0]
886 ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1]
887 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z0.b
888 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z9.b, z10.b }, z0.b
889 ; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
890 ; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
891 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
892 ; CHECK-NEXT: addvl sp, sp, #3
893 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
896 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
897 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
898 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
899 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
900 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
901 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
902 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
903 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
904 call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %zn)
905 call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> %zn)
909 define void @sudot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
910 ; CHECK-LABEL: sudot_single_za32_u8_vg1x4:
912 ; CHECK-NEXT: mov w8, w0
913 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
914 ; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
916 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
917 %slice2 = add i32 %slice, 7
918 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
922 define void @sudot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 {
923 ; CHECK-LABEL: sudot_single_za32_u16_vg1x4_tuple:
924 ; CHECK: // %bb.0: // %entry
925 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
926 ; CHECK-NEXT: addvl sp, sp, #-9
927 ; CHECK-NEXT: lsl x9, x1, #1
928 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
929 ; CHECK-NEXT: ptrue pn8.b
930 ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
931 ; CHECK-NEXT: mov w8, wzr
932 ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
933 ; CHECK-NEXT: add x10, x9, x1
934 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
935 ; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
936 ; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
937 ; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
938 ; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
939 ; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
940 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
941 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
942 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
943 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
944 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b
945 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b
946 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b
947 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b
948 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
949 ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
950 ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
951 ; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
952 ; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
953 ; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
954 ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
955 ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
956 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
957 ; CHECK-NEXT: addvl sp, sp, #9
958 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
961 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
962 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
963 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
964 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
965 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
966 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
967 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
968 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
969 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
970 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
971 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
972 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
973 %mul3 = shl i64 %stride, 1
974 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
975 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
976 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
977 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
978 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
979 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
980 %mul5 = mul i64 %stride, 3
981 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
982 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
983 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
984 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
985 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
986 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
987 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> %zn)
988 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> %zn)
989 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> %zn)
990 call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> %zn)
994 ; == Multi, indexed (unsigned) ==
996 define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
997 ; CHECK-LABEL: udot_lane_za32_u16_vg1x2:
999 ; CHECK-NEXT: mov z5.d, z2.d
1000 ; CHECK-NEXT: mov z4.d, z1.d
1001 ; CHECK-NEXT: mov w8, w0
1002 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
1003 ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
1005 call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
1006 %slice2 = add i32 %slice, 7
1007 call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
1011 define void @udot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
1012 ; CHECK-LABEL: udot_lane_za32_u16_vg1x4:
1014 ; CHECK-NEXT: mov w8, w0
1015 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3]
1016 ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3]
1018 call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1019 <vscale x 8 x i16> %zn4, i32 3)
1020 %slice2 = add i32 %slice, 7
1021 call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1022 <vscale x 8 x i16> %zn4, i32 3)
1026 define void @udot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
1027 ; CHECK-LABEL: udot_lane_za32_u8_vg1x2:
1029 ; CHECK-NEXT: mov z5.d, z2.d
1030 ; CHECK-NEXT: mov z4.d, z1.d
1031 ; CHECK-NEXT: mov w8, w0
1032 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
1033 ; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
1035 call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1036 %slice2 = add i32 %slice, 7
1037 call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1041 define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
1042 ; CHECK-LABEL: udot_lane_za32_u8_vg1x4:
1044 ; CHECK-NEXT: mov z27.d, z4.d
1045 ; CHECK-NEXT: mov z26.d, z3.d
1046 ; CHECK-NEXT: mov w8, w0
1047 ; CHECK-NEXT: mov z25.d, z2.d
1048 ; CHECK-NEXT: mov z24.d, z1.d
1049 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
1050 ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
1052 call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1053 <vscale x 16 x i8> %zn4, i32 3)
1054 %slice2 = add i32 %slice, 7
1055 call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1056 <vscale x 16 x i8> %zn4, i32 3)
1060 define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
1061 ; CHECK-LABEL: udot_form_2x_tuple:
1062 ; CHECK: // %bb.0: // %entry
1063 ; CHECK-NEXT: ptrue pn8.b
1064 ; CHECK-NEXT: mov w8, wzr
1065 ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
1066 ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
1067 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
1068 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
1071 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1072 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1073 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1074 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1075 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1076 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1077 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1078 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1079 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1080 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1084 define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1085 ; CHECK-LABEL: udot_form_2x_tuple_svecc:
1086 ; CHECK: // %bb.0: // %entry
1087 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1088 ; CHECK-NEXT: addvl sp, sp, #-3
1089 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1090 ; CHECK-NEXT: ptrue pn8.b
1091 ; CHECK-NEXT: mov w8, wzr
1092 ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
1093 ; CHECK-NEXT: ptrue p0.b
1094 ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
1095 ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0]
1096 ; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1]
1097 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
1098 ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0]
1099 ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
1100 ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
1101 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1102 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1103 ; CHECK-NEXT: addvl sp, sp, #3
1104 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1107 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1108 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1109 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1110 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1111 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1112 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1113 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1114 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1115 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1116 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1117 store <vscale x 16 x i8> %scalable_arg, ptr %ptr
1121 define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
1122 ; CHECK-LABEL: udot_form_4x_tuple:
1123 ; CHECK: // %bb.0: // %entry
1124 ; CHECK-NEXT: lsl x9, x1, #1
1125 ; CHECK-NEXT: ptrue pn8.b
1126 ; CHECK-NEXT: mov w8, wzr
1127 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1128 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1129 ; CHECK-NEXT: add x10, x9, x1
1130 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1131 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1132 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1133 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1134 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1135 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1138 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1139 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1140 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1141 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1142 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1143 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1144 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1145 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1146 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1147 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1148 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1149 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1150 %mul3 = shl i64 %stride, 1
1151 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1152 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1153 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1154 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1155 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1156 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1157 %mul5 = mul i64 %stride, 3
1158 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1159 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1160 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1161 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1162 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1163 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1164 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1165 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1166 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1167 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1171 define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1172 ; CHECK-LABEL: udot_form_4x_tuple_svecc:
1173 ; CHECK: // %bb.0: // %entry
1174 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1175 ; CHECK-NEXT: addvl sp, sp, #-9
1176 ; CHECK-NEXT: lsl x9, x1, #1
1177 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1178 ; CHECK-NEXT: ptrue pn8.b
1179 ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1180 ; CHECK-NEXT: mov w8, wzr
1181 ; CHECK-NEXT: ptrue p0.b
1182 ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1183 ; CHECK-NEXT: add x10, x9, x1
1184 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1185 ; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1186 ; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1187 ; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1188 ; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1189 ; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1190 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1191 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1192 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1193 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1194 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1195 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1196 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1197 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1198 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1199 ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1200 ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1201 ; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1202 ; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1203 ; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1204 ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1205 ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1206 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1207 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1208 ; CHECK-NEXT: addvl sp, sp, #9
1209 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1212 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1213 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1214 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1215 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1216 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1217 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1218 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1219 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1220 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1221 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1222 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1223 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1224 %mul3 = shl i64 %stride, 1
1225 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1226 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1227 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1228 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1229 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1230 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1231 %mul5 = mul i64 %stride, 3
1232 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1233 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1234 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1235 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1236 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1237 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1238 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1239 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1240 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1241 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1242 store <vscale x 16 x i8> %scalable_arg, ptr %ptr
1246 define void @udot_single_za32_u16_vg1x4_x2load_x4tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 {
1247 ; CHECK-LABEL: udot_single_za32_u16_vg1x4_x2load_x4tuple:
1248 ; CHECK: // %bb.0: // %entry
1249 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1250 ; CHECK-NEXT: addvl sp, sp, #-5
1251 ; CHECK-NEXT: lsl x9, x1, #1
1252 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1253 ; CHECK-NEXT: ptrue pn8.b
1254 ; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
1255 ; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
1256 ; CHECK-NEXT: ptrue pn8.b
1257 ; CHECK-NEXT: str z9, [sp, #4, mul vl] // 16-byte Folded Spill
1258 ; CHECK-NEXT: add x10, x9, x1
1259 ; CHECK-NEXT: mov w8, wzr
1260 ; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0]
1261 ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1]
1262 ; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x9]
1263 ; CHECK-NEXT: ld1b { z4.b, z12.b }, pn8/z, [x0, x10]
1264 ; CHECK-NEXT: ptrue pn8.b
1265 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z0.b
1266 ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z9.b - z12.b }, z0.b
1267 ; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
1268 ; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
1269 ; CHECK-NEXT: ldr z9, [sp, #4, mul vl] // 16-byte Folded Reload
1270 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1271 ; CHECK-NEXT: addvl sp, sp, #5
1272 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1275 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1276 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1277 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1278 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1279 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1280 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1281 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1282 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1283 %mul3 = shl i64 %stride, 1
1284 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1285 %7 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1286 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %7, 0
1287 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %7, 1
1288 %mul5 = mul i64 %stride, 3
1289 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1290 %10 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1291 %11 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %10, 0
1292 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %10, 1
1293 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %8, <vscale x 16 x i8> %11, <vscale x 16 x i8> %zn)
1294 call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> %9, <vscale x 16 x i8> %12, <vscale x 16 x i8> %zn)
1298 define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
1299 ; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
1301 ; CHECK-NEXT: mov z5.d, z2.d
1302 ; CHECK-NEXT: mov z4.d, z1.d
1303 ; CHECK-NEXT: mov w8, w0
1304 ; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
1305 ; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
1307 call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
1308 %slice2 = add i32 %slice, 7
1309 call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
1313 define void @udot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
1314 ; CHECK-LABEL: udot_lane_za64_u16_vg1x4:
1316 ; CHECK-NEXT: mov z27.d, z4.d
1317 ; CHECK-NEXT: mov z26.d, z3.d
1318 ; CHECK-NEXT: mov w8, w0
1319 ; CHECK-NEXT: mov z25.d, z2.d
1320 ; CHECK-NEXT: mov z24.d, z1.d
1321 ; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
1322 ; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z24.h - z27.h }, z5.h[1]
1324 call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1325 <vscale x 8 x i16> %zn4, i32 1)
1326 %slice2 = add i32 %slice, 7
1327 call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1328 <vscale x 8 x i16> %zn4, i32 1)
1332 define void @usdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
1333 ; CHECK-LABEL: usdot_lane_za32_u8_vg1x2:
1335 ; CHECK-NEXT: mov z5.d, z2.d
1336 ; CHECK-NEXT: mov z4.d, z1.d
1337 ; CHECK-NEXT: mov w8, w0
1338 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
1339 ; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
1341 call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1342 %slice2 = add i32 %slice, 7
1343 call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1347 define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
1348 ; CHECK-LABEL: usdot_lane_za32_u8_vg1x4:
1350 ; CHECK-NEXT: mov z27.d, z4.d
1351 ; CHECK-NEXT: mov z26.d, z3.d
1352 ; CHECK-NEXT: mov w8, w0
1353 ; CHECK-NEXT: mov z25.d, z2.d
1354 ; CHECK-NEXT: mov z24.d, z1.d
1355 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
1356 ; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
1358 call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1359 <vscale x 16 x i8> %zn4, i32 3)
1360 %slice2 = add i32 %slice, 7
1361 call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1362 <vscale x 16 x i8> %zn4, i32 3)
1366 define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
1367 ; CHECK-LABEL: usdot_form_2x_tuple:
1368 ; CHECK: // %bb.0: // %entry
1369 ; CHECK-NEXT: ptrue pn8.b
1370 ; CHECK-NEXT: mov w8, wzr
1371 ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
1372 ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
1373 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
1374 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
1377 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1378 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1379 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1380 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1381 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1382 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1383 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1384 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1385 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1386 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1390 define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1391 ; CHECK-LABEL: usdot_form_2x_tuple_svecc:
1392 ; CHECK: // %bb.0: // %entry
1393 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1394 ; CHECK-NEXT: addvl sp, sp, #-3
1395 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1396 ; CHECK-NEXT: ptrue pn8.b
1397 ; CHECK-NEXT: mov w8, wzr
1398 ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
1399 ; CHECK-NEXT: ptrue p0.b
1400 ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
1401 ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0]
1402 ; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1]
1403 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
1404 ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0]
1405 ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
1406 ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
1407 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1408 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1409 ; CHECK-NEXT: addvl sp, sp, #3
1410 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1413 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1414 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1415 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1416 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1417 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1418 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1419 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1420 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1421 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1422 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1423 store <vscale x 16 x i8> %scalable_arg, ptr %ptr
1427 define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
1428 ; CHECK-LABEL: usdot_form_4x_tuple:
1429 ; CHECK: // %bb.0: // %entry
1430 ; CHECK-NEXT: lsl x9, x1, #1
1431 ; CHECK-NEXT: ptrue pn8.b
1432 ; CHECK-NEXT: mov w8, wzr
1433 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1434 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1435 ; CHECK-NEXT: add x10, x9, x1
1436 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1437 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1438 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1439 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1440 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1441 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1444 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1445 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1446 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1447 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1448 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1449 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1450 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1451 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1452 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1453 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1454 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1455 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1456 %mul3 = shl i64 %stride, 1
1457 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1458 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1459 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1460 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1461 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1462 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1463 %mul5 = mul i64 %stride, 3
1464 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1465 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1466 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1467 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1468 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1469 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1470 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1471 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1472 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1473 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1477 define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1478 ; CHECK-LABEL: usdot_form_4x_tuple_svecc:
1479 ; CHECK: // %bb.0: // %entry
1480 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1481 ; CHECK-NEXT: addvl sp, sp, #-9
1482 ; CHECK-NEXT: lsl x9, x1, #1
1483 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1484 ; CHECK-NEXT: ptrue pn8.b
1485 ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1486 ; CHECK-NEXT: mov w8, wzr
1487 ; CHECK-NEXT: ptrue p0.b
1488 ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1489 ; CHECK-NEXT: add x10, x9, x1
1490 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1491 ; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1492 ; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1493 ; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1494 ; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1495 ; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1496 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1497 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1498 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1499 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1500 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1501 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1502 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1503 ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1504 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1505 ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1506 ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1507 ; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1508 ; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1509 ; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1510 ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1511 ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1512 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1513 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1514 ; CHECK-NEXT: addvl sp, sp, #9
1515 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1518 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1519 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1520 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1521 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1522 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1523 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1524 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1525 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1526 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1527 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1528 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1529 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1530 %mul3 = shl i64 %stride, 1
1531 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1532 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1533 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1534 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1535 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1536 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1537 %mul5 = mul i64 %stride, 3
1538 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1539 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1540 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1541 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1542 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1543 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1544 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1545 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1546 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1547 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1548 store <vscale x 16 x i8> %scalable_arg, ptr %ptr
1552 ; == Multi, indexed (signed) ==
1554 define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
1555 ; CHECK-LABEL: sdot_lane_za32_u16_vg1x2:
1557 ; CHECK-NEXT: mov z5.d, z2.d
1558 ; CHECK-NEXT: mov z4.d, z1.d
1559 ; CHECK-NEXT: mov w8, w0
1560 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
1561 ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
1563 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
1564 %slice2 = add i32 %slice, 7
1565 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
1569 define void @sdot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
1570 ; CHECK-LABEL: sdot_lane_za32_u16_vg1x4:
1572 ; CHECK-NEXT: mov z27.d, z4.d
1573 ; CHECK-NEXT: mov z26.d, z3.d
1574 ; CHECK-NEXT: mov w8, w0
1575 ; CHECK-NEXT: mov z25.d, z2.d
1576 ; CHECK-NEXT: mov z24.d, z1.d
1577 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
1578 ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z24.h - z27.h }, z5.h[3]
1580 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1581 <vscale x 8 x i16> %zn4, i32 3)
1582 %slice2 = add i32 %slice, 7
1583 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1584 <vscale x 8 x i16> %zn4, i32 3)
1588 define void @sdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
1589 ; CHECK-LABEL: sdot_lane_za32_u8_vg1x2:
1591 ; CHECK-NEXT: mov z5.d, z2.d
1592 ; CHECK-NEXT: mov z4.d, z1.d
1593 ; CHECK-NEXT: mov w8, w0
1594 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
1595 ; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
1597 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1598 %slice2 = add i32 %slice, 7
1599 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1603 define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
1604 ; CHECK-LABEL: sdot_lane_za32_u8_vg1x4:
1606 ; CHECK-NEXT: mov z27.d, z4.d
1607 ; CHECK-NEXT: mov z26.d, z3.d
1608 ; CHECK-NEXT: mov w8, w0
1609 ; CHECK-NEXT: mov z25.d, z2.d
1610 ; CHECK-NEXT: mov z24.d, z1.d
1611 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
1612 ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
1614 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1615 <vscale x 16 x i8> %zn4, i32 3)
1616 %slice2 = add i32 %slice, 7
1617 call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1618 <vscale x 16 x i8> %zn4, i32 3)
1622 define void @sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
1623 ; CHECK-LABEL: sdot_form_2x_tuple:
1624 ; CHECK: // %bb.0: // %entry
1625 ; CHECK-NEXT: ptrue pn8.b
1626 ; CHECK-NEXT: mov w8, wzr
1627 ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
1628 ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
1629 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
1630 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
1633 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1634 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1635 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1636 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1637 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1638 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1639 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1640 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1641 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1642 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1646 define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1647 ; CHECK-LABEL: sdot_form_2x_tuple_svecc:
1648 ; CHECK: // %bb.0: // %entry
1649 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1650 ; CHECK-NEXT: addvl sp, sp, #-3
1651 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1652 ; CHECK-NEXT: ptrue pn8.b
1653 ; CHECK-NEXT: mov w8, wzr
1654 ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
1655 ; CHECK-NEXT: ptrue p0.b
1656 ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
1657 ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0]
1658 ; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1]
1659 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
1660 ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0]
1661 ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
1662 ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
1663 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1664 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1665 ; CHECK-NEXT: addvl sp, sp, #3
1666 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1669 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1670 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1671 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1672 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1673 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1674 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1675 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1676 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1677 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1678 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1679 store <vscale x 16 x i8> %scalable_arg, ptr %ptr
1683 define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
1684 ; CHECK-LABEL: sdot_form_4x_tuple:
1685 ; CHECK: // %bb.0: // %entry
1686 ; CHECK-NEXT: lsl x9, x1, #1
1687 ; CHECK-NEXT: ptrue pn8.b
1688 ; CHECK-NEXT: mov w8, wzr
1689 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1690 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1691 ; CHECK-NEXT: add x10, x9, x1
1692 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1693 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1694 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1695 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1696 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1697 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1700 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1701 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1702 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1703 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1704 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1705 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1706 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1707 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1708 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1709 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1710 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1711 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1712 %mul3 = shl i64 %stride, 1
1713 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1714 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1715 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1716 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1717 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1718 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1719 %mul5 = mul i64 %stride, 3
1720 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1721 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1722 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1723 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1724 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1725 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1726 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1727 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1728 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1729 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1733 define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1734 ; CHECK-LABEL: sdot_form_4x_tuple_svecc:
1735 ; CHECK: // %bb.0: // %entry
1736 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1737 ; CHECK-NEXT: addvl sp, sp, #-9
1738 ; CHECK-NEXT: lsl x9, x1, #1
1739 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1740 ; CHECK-NEXT: ptrue pn8.b
1741 ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1742 ; CHECK-NEXT: mov w8, wzr
1743 ; CHECK-NEXT: ptrue p0.b
1744 ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1745 ; CHECK-NEXT: add x10, x9, x1
1746 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1747 ; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1748 ; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1749 ; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1750 ; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1751 ; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1752 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1753 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1754 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1755 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1756 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1757 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1758 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1759 ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1760 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1761 ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1762 ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1763 ; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1764 ; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1765 ; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1766 ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1767 ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1768 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1769 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1770 ; CHECK-NEXT: addvl sp, sp, #9
1771 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1774 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1775 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1776 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1777 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1778 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1779 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1780 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1781 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1782 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1783 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1784 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1785 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1786 %mul3 = shl i64 %stride, 1
1787 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1788 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1789 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1790 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1791 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1792 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1793 %mul5 = mul i64 %stride, 3
1794 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1795 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1796 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1797 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1798 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1799 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1800 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1801 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1802 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1803 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1804 store <vscale x 16 x i8> %scalable_arg, ptr %ptr
1808 define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
1809 ; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
1811 ; CHECK-NEXT: mov z5.d, z2.d
1812 ; CHECK-NEXT: mov z4.d, z1.d
1813 ; CHECK-NEXT: mov w8, w0
1814 ; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
1815 ; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
1817 call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
1818 %slice2 = add i32 %slice, 7
1819 call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
1823 define void @sdot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
1824 ; CHECK-LABEL: sdot_lane_za64_u16_vg1x4:
1826 ; CHECK-NEXT: mov z27.d, z4.d
1827 ; CHECK-NEXT: mov z26.d, z3.d
1828 ; CHECK-NEXT: mov w8, w0
1829 ; CHECK-NEXT: mov z25.d, z2.d
1830 ; CHECK-NEXT: mov z24.d, z1.d
1831 ; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
1832 ; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z24.h - z27.h }, z5.h[1]
1834 call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1835 <vscale x 8 x i16> %zn4, i32 1)
1836 %slice2 = add i32 %slice, 7
1837 call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1838 <vscale x 8 x i16> %zn4, i32 1)
1844 define void @sudot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
1845 ; CHECK-LABEL: sudot_lane_za32_u8_vg1x2:
1847 ; CHECK-NEXT: mov z5.d, z2.d
1848 ; CHECK-NEXT: mov z4.d, z1.d
1849 ; CHECK-NEXT: mov w8, w0
1850 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
1851 ; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
1853 call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1854 %slice2 = add i32 %slice, 7
1855 call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1859 define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
1860 ; CHECK-LABEL: sudot_lane_za32_u8_vg1x4:
1862 ; CHECK-NEXT: mov z27.d, z4.d
1863 ; CHECK-NEXT: mov z26.d, z3.d
1864 ; CHECK-NEXT: mov w8, w0
1865 ; CHECK-NEXT: mov z25.d, z2.d
1866 ; CHECK-NEXT: mov z24.d, z1.d
1867 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
1868 ; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
1870 call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1871 <vscale x 16 x i8> %zn4, i32 3)
1872 %slice2 = add i32 %slice, 7
1873 call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1874 <vscale x 16 x i8> %zn4, i32 3)
1878 define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
1879 ; CHECK-LABEL: sudot_form_2x_tuple:
1880 ; CHECK: // %bb.0: // %entry
1881 ; CHECK-NEXT: ptrue pn8.b
1882 ; CHECK-NEXT: mov w8, wzr
1883 ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
1884 ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
1885 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
1886 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
1889 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1890 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1891 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1892 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1893 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1894 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1895 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1896 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1897 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1898 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1902 define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1903 ; CHECK-LABEL: sudot_form_2x_tuple_svecc:
1904 ; CHECK: // %bb.0: // %entry
1905 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1906 ; CHECK-NEXT: addvl sp, sp, #-3
1907 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1908 ; CHECK-NEXT: ptrue pn8.b
1909 ; CHECK-NEXT: mov w8, wzr
1910 ; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
1911 ; CHECK-NEXT: ptrue p0.b
1912 ; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
1913 ; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0]
1914 ; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1]
1915 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
1916 ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0]
1917 ; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
1918 ; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
1919 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1920 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1921 ; CHECK-NEXT: addvl sp, sp, #3
1922 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1925 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1926 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1927 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1928 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1929 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1930 %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1931 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1932 %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1933 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1934 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1935 store <vscale x 16 x i8> %scalable_arg, ptr %ptr
1939 define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
1940 ; CHECK-LABEL: sudot_form_4x_tuple:
1941 ; CHECK: // %bb.0: // %entry
1942 ; CHECK-NEXT: lsl x9, x1, #1
1943 ; CHECK-NEXT: ptrue pn8.b
1944 ; CHECK-NEXT: mov w8, wzr
1945 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1946 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1947 ; CHECK-NEXT: add x10, x9, x1
1948 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1949 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1950 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1951 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1952 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1953 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1956 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1957 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1958 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1959 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1960 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1961 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1962 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1963 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1964 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1965 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1966 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1967 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1968 %mul3 = shl i64 %stride, 1
1969 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1970 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1971 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1972 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1973 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1974 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1975 %mul5 = mul i64 %stride, 3
1976 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1977 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1978 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1979 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1980 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1981 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1982 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1983 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1984 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1985 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1989 define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1990 ; CHECK-LABEL: sudot_form_4x_tuple_svecc:
1991 ; CHECK: // %bb.0: // %entry
1992 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1993 ; CHECK-NEXT: addvl sp, sp, #-9
1994 ; CHECK-NEXT: lsl x9, x1, #1
1995 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1996 ; CHECK-NEXT: ptrue pn8.b
1997 ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1998 ; CHECK-NEXT: mov w8, wzr
1999 ; CHECK-NEXT: ptrue p0.b
2000 ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
2001 ; CHECK-NEXT: add x10, x9, x1
2002 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
2003 ; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
2004 ; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
2005 ; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
2006 ; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
2007 ; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
2008 ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
2009 ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
2010 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
2011 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
2012 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
2013 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
2014 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
2015 ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
2016 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
2017 ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
2018 ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
2019 ; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
2020 ; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
2021 ; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
2022 ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
2023 ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
2024 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
2025 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
2026 ; CHECK-NEXT: addvl sp, sp, #9
2027 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
2030 %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
2031 %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
2032 %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
2033 %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
2034 %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
2035 %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
2036 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
2037 %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
2038 %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
2039 %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
2040 %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
2041 %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
2042 %mul3 = shl i64 %stride, 1
2043 %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
2044 %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
2045 %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
2046 %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
2047 %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
2048 %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
2049 %mul5 = mul i64 %stride, 3
2050 %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
2051 %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
2052 %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
2053 %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
2054 %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
2055 %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
2056 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
2057 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
2058 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
2059 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
2060 store <vscale x 16 x i8> %scalable_arg, ptr %ptr
2065 attributes #0 = { nounwind "target-features"="+sme2" }
2066 attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
2068 ; == Multi, multi (unsigned)
2070 declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2071 declare void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
2072 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2073 declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2074 declare void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
2075 <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2076 declare void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2077 declare void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
2078 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2079 declare void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2080 declare void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
2081 <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2083 ; == Multi, multi (signed)
2085 declare void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2086 declare void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
2087 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2088 declare void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2089 declare void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
2090 <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2091 declare void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2092 declare void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
2093 <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2095 ; == Multi, single (unsigned)
2097 declare void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2098 declare void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2099 declare void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2100 declare void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2101 declare void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2102 declare void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2103 declare void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2104 declare void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2106 ; == Multi, single (signed)
2108 declare void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2109 declare void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2110 declare void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2111 declare void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2112 declare void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2113 declare void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2114 declare void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2115 declare void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2117 ; == Multi, indexed (unsigned)
2119 declare void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2120 declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2121 declare void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2122 declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2123 declare void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2124 declare void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2125 declare void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2126 declare void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2128 ; == Multi, indexed (signed)
2130 declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2131 declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2132 declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2133 declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2134 declare void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2135 declare void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2136 declare void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2137 declare void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)