; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM

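; A partial reduction of a zero-extended i8 multiply into an nxv4i32 accumulator is expected to lower to a single UDOT.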
define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: udot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot z0.s, z1.b, z2.b
; CHECK-NEXT: ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

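; The same unsigned pattern with i16 inputs and an nxv2i64 accumulator uses the 64-bit UDOT form.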
define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: udot_wide:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot z0.d, z1.h, z2.h
; CHECK-NEXT: ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

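; Sign-extended inputs select SDOT instead (i8 -> i32 here, i16 -> i64 in sdot_wide below).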
define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sdot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-NEXT: ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sdot_wide:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-NEXT: ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

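; Mixed zero/sign-extended inputs need the USDOT instruction from +i8mm; without it the reduction is expanded into unpacks and predicated MLAs.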
define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: usdot:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: usdot z0.s, z1.b, z2.b
; CHECK-I8MM-NEXT: ret
;
; CHECK-NOI8MM-LABEL: usdot:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: uunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT: uunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT: ptrue p0.s
; CHECK-NOI8MM-NEXT: uunpklo z5.s, z3.h
; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT: uunpklo z7.s, z1.h
; CHECK-NOI8MM-NEXT: uunpkhi z1.s, z1.h
; CHECK-NOI8MM-NEXT: sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s
; CHECK-NOI8MM-NEXT: mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NOI8MM-NEXT: movprfx z1, z3
; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s
; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT: ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

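; The sext*zext case reuses USDOT with its operands commuted when +i8mm is available.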
define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: sudot:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: usdot z0.s, z2.b, z1.b
; CHECK-I8MM-NEXT: ret
;
; CHECK-NOI8MM-LABEL: sudot:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: sunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT: sunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT: ptrue p0.s
; CHECK-NOI8MM-NEXT: sunpklo z5.s, z3.h
; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT: sunpklo z7.s, z1.h
; CHECK-NOI8MM-NEXT: sunpkhi z1.s, z1.h
; CHECK-NOI8MM-NEXT: uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s
; CHECK-NOI8MM-NEXT: mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NOI8MM-NEXT: movprfx z1, z3
; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s
; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT: ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

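; i8 -> i64 reductions go via a 32-bit dot product into a zeroed accumulator whose halves are then widened and added to the i64 accumulators.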
define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: udot_8to64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z4.s, #0 // =0x0
; CHECK-NEXT: udot z4.s, z2.b, z3.b
; CHECK-NEXT: sunpklo z2.d, z4.s
; CHECK-NEXT: sunpkhi z3.d, z4.s
; CHECK-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEXT: ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sdot_8to64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z4.s, #0 // =0x0
; CHECK-NEXT: sdot z4.s, z2.b, z3.b
; CHECK-NEXT: sunpklo z2.d, z4.s
; CHECK-NEXT: sunpkhi z3.d, z4.s
; CHECK-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEXT: ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

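; The mixed-sign i8 -> i64 case uses the same USDOT-based scheme with +i8mm; without it the full widen-and-MLA expansion (including Z-register spills) is expected.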
define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: usdot_8to64:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: mov z4.s, #0 // =0x0
; CHECK-I8MM-NEXT: usdot z4.s, z2.b, z3.b
; CHECK-I8MM-NEXT: sunpklo z2.d, z4.s
; CHECK-I8MM-NEXT: sunpkhi z3.d, z4.s
; CHECK-I8MM-NEXT: add z0.d, z0.d, z2.d
; CHECK-I8MM-NEXT: add z1.d, z1.d, z3.d
; CHECK-I8MM-NEXT: ret
;
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NOI8MM-NEXT: addvl sp, sp, #-2
; CHECK-NOI8MM-NEXT: str z9, [sp] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NOI8MM-NEXT: .cfi_offset w29, -16
; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT: sunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT: sunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT: sunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT: sunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT: uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: sunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: uunpkhi z26.d, z6.s
; CHECK-NOI8MM-NEXT: uunpklo z6.d, z6.s
; CHECK-NOI8MM-NEXT: uunpklo z27.d, z4.s
; CHECK-NOI8MM-NEXT: sunpklo z28.d, z7.s
; CHECK-NOI8MM-NEXT: sunpklo z29.d, z5.s
; CHECK-NOI8MM-NEXT: uunpkhi z4.d, z4.s
; CHECK-NOI8MM-NEXT: sunpkhi z7.d, z7.s
; CHECK-NOI8MM-NEXT: sunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT: uunpkhi z30.d, z24.s
; CHECK-NOI8MM-NEXT: uunpkhi z31.d, z2.s
; CHECK-NOI8MM-NEXT: uunpklo z24.d, z24.s
; CHECK-NOI8MM-NEXT: uunpklo z2.d, z2.s
; CHECK-NOI8MM-NEXT: sunpkhi z8.d, z25.s
; CHECK-NOI8MM-NEXT: sunpklo z25.d, z25.s
; CHECK-NOI8MM-NEXT: sunpklo z9.d, z3.s
; CHECK-NOI8MM-NEXT: mul z27.d, z27.d, z29.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT: sunpkhi z3.d, z3.s
; CHECK-NOI8MM-NEXT: mul z4.d, z4.d, z5.d
; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z7.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z9.d
; CHECK-NOI8MM-NEXT: movprfx z2, z27
; CHECK-NOI8MM-NEXT: mla z2.d, p0/m, z24.d, z25.d
; CHECK-NOI8MM-NEXT: ldr z9, [sp] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z31.d, z3.d
; CHECK-NOI8MM-NEXT: movprfx z3, z4
; CHECK-NOI8MM-NEXT: mla z3.d, p0/m, z30.d, z8.d
; CHECK-NOI8MM-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT: add z0.d, z2.d, z0.d
; CHECK-NOI8MM-NEXT: add z1.d, z3.d, z1.d
; CHECK-NOI8MM-NEXT: addvl sp, sp, #2
; CHECK-NOI8MM-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NOI8MM-NEXT: ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: sudot_8to64:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: mov z4.s, #0 // =0x0
; CHECK-I8MM-NEXT: usdot z4.s, z3.b, z2.b
; CHECK-I8MM-NEXT: sunpklo z2.d, z4.s
; CHECK-I8MM-NEXT: sunpkhi z3.d, z4.s
; CHECK-I8MM-NEXT: add z0.d, z0.d, z2.d
; CHECK-I8MM-NEXT: add z1.d, z1.d, z3.d
; CHECK-I8MM-NEXT: ret
;
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NOI8MM-NEXT: addvl sp, sp, #-2
; CHECK-NOI8MM-NEXT: str z9, [sp] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NOI8MM-NEXT: .cfi_offset w29, -16
; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT: uunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT: uunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT: uunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT: uunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT: sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: uunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: sunpkhi z26.d, z6.s
; CHECK-NOI8MM-NEXT: sunpklo z6.d, z6.s
; CHECK-NOI8MM-NEXT: sunpklo z27.d, z4.s
; CHECK-NOI8MM-NEXT: uunpklo z28.d, z7.s
; CHECK-NOI8MM-NEXT: uunpklo z29.d, z5.s
; CHECK-NOI8MM-NEXT: sunpkhi z4.d, z4.s
; CHECK-NOI8MM-NEXT: uunpkhi z7.d, z7.s
; CHECK-NOI8MM-NEXT: uunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT: sunpkhi z30.d, z24.s
; CHECK-NOI8MM-NEXT: sunpkhi z31.d, z2.s
; CHECK-NOI8MM-NEXT: sunpklo z24.d, z24.s
; CHECK-NOI8MM-NEXT: sunpklo z2.d, z2.s
; CHECK-NOI8MM-NEXT: uunpkhi z8.d, z25.s
; CHECK-NOI8MM-NEXT: uunpklo z25.d, z25.s
; CHECK-NOI8MM-NEXT: uunpklo z9.d, z3.s
; CHECK-NOI8MM-NEXT: mul z27.d, z27.d, z29.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT: uunpkhi z3.d, z3.s
; CHECK-NOI8MM-NEXT: mul z4.d, z4.d, z5.d
; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z7.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z9.d
; CHECK-NOI8MM-NEXT: movprfx z2, z27
; CHECK-NOI8MM-NEXT: mla z2.d, p0/m, z24.d, z25.d
; CHECK-NOI8MM-NEXT: ldr z9, [sp] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z31.d, z3.d
; CHECK-NOI8MM-NEXT: movprfx z3, z4
; CHECK-NOI8MM-NEXT: mla z3.d, p0/m, z30.d, z8.d
; CHECK-NOI8MM-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT: add z0.d, z2.d, z0.d
; CHECK-NOI8MM-NEXT: add z1.d, z3.d, z1.d
; CHECK-NOI8MM-NEXT: addvl sp, sp, #2
; CHECK-NOI8MM-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NOI8MM-NEXT: ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

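; Negative tests: the following reductions only halve the element count, which does not match the 4:1 ratio of the dot-product instructions, so they are expanded.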
define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: not_udot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
entry:
  %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
  %mult = mul nuw nsw <vscale x 8 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
; CHECK-LABEL: not_udot_wide:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z1.s, z1.s, #0xffff
; CHECK-NEXT: and z2.s, z2.s, #0xffff
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpklo z4.d, z2.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
entry:
  %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
  %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
  %mult = mul nuw nsw <vscale x 4 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

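; There is no mixed-sign dot product for i16 inputs, so these widened multiplies are also expanded.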
define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: not_usdot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z5.d, z3.s
; CHECK-NEXT: uunpkhi z3.d, z3.s
; CHECK-NEXT: sunpklo z6.d, z4.s
; CHECK-NEXT: sunpkhi z4.d, z4.s
; CHECK-NEXT: uunpklo z7.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: sunpklo z24.d, z2.s
; CHECK-NEXT: sunpkhi z2.d, z2.s
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT: mul z3.d, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: movprfx z1, z3
; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: not_sudot:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sunpklo z3.s, z1.h
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sunpklo z5.d, z3.s
; CHECK-NEXT: sunpkhi z3.d, z3.s
; CHECK-NEXT: uunpklo z6.d, z4.s
; CHECK-NEXT: uunpkhi z4.d, z4.s
; CHECK-NEXT: sunpklo z7.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: uunpklo z24.d, z2.s
; CHECK-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT: mul z3.d, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: movprfx z1, z3
; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}