; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s

target triple="aarch64-linux-gnu"
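; This file exercises the SME2 multi-vector integer dot-product intrinsics
; (udot, sdot, usdot and sudot) in their multi-multi, multi-single and
; multi-indexed (lane) forms, checking that each call lowers to the
; corresponding dot instruction on ZA.S or ZA.D slices with a vgx2 or vgx4
; vector group.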
; == Multi, multi (unsigned) ==

define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 {
; CHECK-LABEL: udot_multi_za32_u16_vg1x2:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
  call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
  ret void
}

define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: udot_multi_za32_u16_vg1x4:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
                                        <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
  call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
  ret void
}
define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
; CHECK-LABEL: udot_multi_za32_u8_vg1x2:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
  call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
  ret void
}

define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: udot_multi_za32_u8_vg1x4:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
                                        <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
  call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
  ret void
}
define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 {
; CHECK-LABEL: udot_multi_za64_u16_vg1x2:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
  call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
  ret void
}

define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: udot_multi_za64_u16_vg1x4:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
                                        <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
  call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
  ret void
}
define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
; CHECK-LABEL: usdot_multi_za32_u8_vg1x2:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
  call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
  ret void
}

define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: usdot_multi_za32_u8_vg1x4:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
                                        <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
  call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                       <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                       <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
  ret void
}
; == Multi, multi (signed) ==

define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 {
; CHECK-LABEL: sdot_multi_za32_u16_vg1x2:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
  call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
  ret void
}

define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za32_u16_vg1x4:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
                                        <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
  call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
  ret void
}
define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
; CHECK-LABEL: sdot_multi_za32_u8_vg1x2:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
  call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
  ret void
}

define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: sdot_multi_za32_u8_vg1x4:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
                                        <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
  call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
  ret void
}
define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 {
; CHECK-LABEL: sdot_multi_za64_u16_vg1x2:
; CHECK-NEXT: mov z5.d, z4.d
; CHECK-NEXT: mov z7.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z3.d
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
  call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
  ret void
}

define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za64_u16_vg1x4:
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
                                        <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
  call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
  ret void
}
; == Multi, single (unsigned) ==

define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: udot_single_za32_u16_vg1x2:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  ret void
}

define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: udot_single_za32_u16_vg1x4:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  ret void
}
define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: udot_single_za32_u8_vg1x2:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  ret void
}

define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: udot_single_za32_u8_vg1x4:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  ret void
}
define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_single_za64_u16_vg1x2:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
  call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  ret void
}

define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: udot_single_za64_u16_vg1x4:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
  call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  ret void
}
define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: usdot_single_za32_u8_vg1x2:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
  call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  ret void
}

define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: usdot_single_za32_u8_vg1x4:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
  call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  ret void
}
; == Multi, single (signed) ==

define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: sdot_single_za32_u16_vg1x2:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  ret void
}

define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: sdot_single_za32_u16_vg1x4:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  ret void
}
define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sdot_single_za32_u8_vg1x2:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  ret void
}

define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sdot_single_za32_u8_vg1x4:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  ret void
}
define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_single_za64_u16_vg1x2:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
  call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  ret void
}

define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: sdot_single_za64_u16_vg1x4:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
  call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  ret void
}
define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sudot_single_za32_u8_vg1x2:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
  call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  ret void
}

define void @sudot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sudot_single_za32_u8_vg1x4:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
  call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  ret void
}
; == Multi, indexed (unsigned) ==

define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: udot_lane_za32_u16_vg1x2:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
  call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
  ret void
}

define void @udot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: udot_lane_za32_u16_vg1x4:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3]
  call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 3)
  ret void
}
define void @udot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: udot_lane_za32_u8_vg1x2:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
  call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  ret void
}

define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: udot_lane_za32_u8_vg1x4:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
  call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                           <vscale x 16 x i8> %zn4, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                           <vscale x 16 x i8> %zn4, i32 3)
  ret void
}
define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
  call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
  ret void
}

define void @udot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: udot_lane_za64_u16_vg1x4:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z24.h - z27.h }, z5.h[1]
  call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 1)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 1)
  ret void
}
define void @usdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: usdot_lane_za32_u8_vg1x2:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
  call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  ret void
}

define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: usdot_lane_za32_u8_vg1x4:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
  call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                            <vscale x 16 x i8> %zn4, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                            <vscale x 16 x i8> %zn4, i32 3)
  ret void
}
; == Multi, indexed (signed) ==

define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: sdot_lane_za32_u16_vg1x2:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
  ret void
}

define void @sdot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: sdot_lane_za32_u16_vg1x4:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z24.h - z27.h }, z5.h[3]
  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 3)
  ret void
}
define void @sdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sdot_lane_za32_u8_vg1x2:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  ret void
}

define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sdot_lane_za32_u8_vg1x4:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                           <vscale x 16 x i8> %zn4, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                           <vscale x 16 x i8> %zn4, i32 3)
  ret void
}
define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
  ret void
}

define void @sdot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: sdot_lane_za64_u16_vg1x4:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z24.h - z27.h }, z5.h[1]
  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 1)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                           <vscale x 8 x i16> %zn4, i32 1)
  ret void
}
define void @sudot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sudot_lane_za32_u8_vg1x2:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
  ret void
}

define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sudot_lane_za32_u8_vg1x4:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                            <vscale x 16 x i8> %zn4, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
                                                            <vscale x 16 x i8> %zn4, i32 3)
  ret void
}
attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
; == Multi, multi (unsigned)

declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
                                                       <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
                                                        <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

; == Multi, multi (signed)

declare void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
                                                       <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)

; == Multi, single (unsigned)

declare void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

; == Multi, single (signed)

declare void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)

; == Multi, indexed (unsigned)

declare void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

; == Multi, indexed (signed)

declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)