1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3 ; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; Declarations of the integer add-reduction intrinsic, one per <N x i32>
; vector width. The tests visible in this part of the file call the v4i32,
; v5i32, v8i32 and v16i32 variants; the wider ones are presumably used by
; tests further down the file.
5 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
6 declare i32 @llvm.vector.reduce.add.v5i32(<5 x i32>)
7 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
8 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
9 declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
10 declare i32 @llvm.vector.reduce.add.v25i32(<25 x i32>)
11 declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
12 declare i32 @llvm.vector.reduce.add.v33i32(<33 x i32>)
13 declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
14 declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
; test_udot_v4i8: unsigned dot product of two <4 x i8> vectors loaded from %a
; and %b, accumulated with %sum (zext -> mul -> reduce.add -> add).
; Neither expected sequence below contains a udot instruction. The SD
; expectations widen with ushll and multiply with umull; the GI expectations
; (-global-isel run) extract and zero-extend the bytes one at a time before a
; vector mul. Both finish with addv and a scalar add of w2.
16 define i32 @test_udot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
17 ; CHECK-SD-LABEL: test_udot_v4i8:
18 ; CHECK-SD: // %bb.0: // %entry
19 ; CHECK-SD-NEXT: ldr s0, [x0]
20 ; CHECK-SD-NEXT: ldr s1, [x1]
21 ; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
22 ; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
23 ; CHECK-SD-NEXT: umull v0.4s, v1.4h, v0.4h
24 ; CHECK-SD-NEXT: addv s0, v0.4s
25 ; CHECK-SD-NEXT: fmov w8, s0
26 ; CHECK-SD-NEXT: add w0, w8, w2
29 ; CHECK-GI-LABEL: test_udot_v4i8:
30 ; CHECK-GI: // %bb.0: // %entry
31 ; CHECK-GI-NEXT: ldr w8, [x0]
32 ; CHECK-GI-NEXT: ldr w9, [x1]
33 ; CHECK-GI-NEXT: fmov s0, w8
34 ; CHECK-GI-NEXT: fmov s2, w9
35 ; CHECK-GI-NEXT: uxtb w8, w8
36 ; CHECK-GI-NEXT: uxtb w9, w9
37 ; CHECK-GI-NEXT: mov b1, v0.b[1]
38 ; CHECK-GI-NEXT: mov b3, v0.b[2]
39 ; CHECK-GI-NEXT: mov b5, v2.b[2]
40 ; CHECK-GI-NEXT: mov b4, v0.b[3]
41 ; CHECK-GI-NEXT: mov b0, v2.b[1]
42 ; CHECK-GI-NEXT: mov b6, v2.b[3]
43 ; CHECK-GI-NEXT: fmov s2, w9
44 ; CHECK-GI-NEXT: fmov w10, s1
45 ; CHECK-GI-NEXT: fmov w11, s3
46 ; CHECK-GI-NEXT: fmov s1, w8
47 ; CHECK-GI-NEXT: fmov w13, s5
48 ; CHECK-GI-NEXT: fmov w8, s4
49 ; CHECK-GI-NEXT: fmov w12, s0
50 ; CHECK-GI-NEXT: uxtb w10, w10
51 ; CHECK-GI-NEXT: uxtb w11, w11
52 ; CHECK-GI-NEXT: uxtb w13, w13
53 ; CHECK-GI-NEXT: uxtb w8, w8
54 ; CHECK-GI-NEXT: uxtb w12, w12
55 ; CHECK-GI-NEXT: mov v1.h[1], w10
56 ; CHECK-GI-NEXT: fmov w10, s6
57 ; CHECK-GI-NEXT: fmov s0, w11
58 ; CHECK-GI-NEXT: fmov s3, w13
59 ; CHECK-GI-NEXT: mov v2.h[1], w12
60 ; CHECK-GI-NEXT: uxtb w10, w10
61 ; CHECK-GI-NEXT: mov v0.h[1], w8
62 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
63 ; CHECK-GI-NEXT: mov v3.h[1], w10
64 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
65 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
66 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
67 ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0]
68 ; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
69 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v1.4s
70 ; CHECK-GI-NEXT: addv s0, v0.4s
71 ; CHECK-GI-NEXT: fmov w8, s0
72 ; CHECK-GI-NEXT: add w0, w8, w2
75 %0 = load <4 x i8>, ptr %a
76 %1 = zext <4 x i8> %0 to <4 x i32>
77 %2 = load <4 x i8>, ptr %b
78 %3 = zext <4 x i8> %2 to <4 x i32>
79 %4 = mul nuw nsw <4 x i32> %3, %1
80 %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
81 %op.extra = add i32 %5, %sum
; test_udot_v4i8_nomla: plain unsigned sum of the four bytes loaded from %a1
; (zext -> reduce.add), with no multiply. The SD expectations widen twice
; with ushll and reduce with addv; the GI expectations (-global-isel run)
; rebuild the byte vector lane by lane, then use uaddlv and mask the result
; to 16 bits.
85 define i32 @test_udot_v4i8_nomla(ptr nocapture readonly %a1) {
86 ; CHECK-SD-LABEL: test_udot_v4i8_nomla:
87 ; CHECK-SD: // %bb.0: // %entry
88 ; CHECK-SD-NEXT: ldr s0, [x0]
89 ; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
90 ; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
91 ; CHECK-SD-NEXT: addv s0, v0.4s
92 ; CHECK-SD-NEXT: fmov w0, s0
95 ; CHECK-GI-LABEL: test_udot_v4i8_nomla:
96 ; CHECK-GI: // %bb.0: // %entry
97 ; CHECK-GI-NEXT: ldr w8, [x0]
98 ; CHECK-GI-NEXT: fmov s0, w8
99 ; CHECK-GI-NEXT: mov b1, v0.b[1]
100 ; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
101 ; CHECK-GI-NEXT: mov b3, v0.b[2]
102 ; CHECK-GI-NEXT: mov b0, v0.b[3]
103 ; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
104 ; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
105 ; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
106 ; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0
107 ; CHECK-GI-NEXT: uaddlv s0, v0.4h
108 ; CHECK-GI-NEXT: fmov w8, s0
109 ; CHECK-GI-NEXT: and w0, w8, #0xffff
112 %0 = load <4 x i8>, ptr %a1
113 %1 = zext <4 x i8> %0 to <4 x i32>
114 %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
; test_sdot_v4i8: signed dot product of two <4 x i8> vectors loaded from %a
; and %b, accumulated with %sum (sext -> mul -> reduce.add -> add).
; Neither expected sequence contains an sdot instruction. The SD expectations
; use sshll + smull + addv; the GI expectations (-global-isel run)
; sign-extend each byte with sxtb before a vector mul and addv.
117 define i32 @test_sdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
118 ; CHECK-SD-LABEL: test_sdot_v4i8:
119 ; CHECK-SD: // %bb.0: // %entry
120 ; CHECK-SD-NEXT: ldr s0, [x0]
121 ; CHECK-SD-NEXT: ldr s1, [x1]
122 ; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
123 ; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
124 ; CHECK-SD-NEXT: smull v0.4s, v1.4h, v0.4h
125 ; CHECK-SD-NEXT: addv s0, v0.4s
126 ; CHECK-SD-NEXT: fmov w8, s0
127 ; CHECK-SD-NEXT: add w0, w8, w2
130 ; CHECK-GI-LABEL: test_sdot_v4i8:
131 ; CHECK-GI: // %bb.0: // %entry
132 ; CHECK-GI-NEXT: ldr w8, [x0]
133 ; CHECK-GI-NEXT: ldr w9, [x1]
134 ; CHECK-GI-NEXT: fmov s0, w8
135 ; CHECK-GI-NEXT: fmov s2, w9
136 ; CHECK-GI-NEXT: sxtb w8, w8
137 ; CHECK-GI-NEXT: sxtb w9, w9
138 ; CHECK-GI-NEXT: mov b1, v0.b[1]
139 ; CHECK-GI-NEXT: mov b3, v0.b[2]
140 ; CHECK-GI-NEXT: mov b5, v2.b[2]
141 ; CHECK-GI-NEXT: mov b4, v0.b[3]
142 ; CHECK-GI-NEXT: mov b0, v2.b[1]
143 ; CHECK-GI-NEXT: mov b6, v2.b[3]
144 ; CHECK-GI-NEXT: fmov s2, w9
145 ; CHECK-GI-NEXT: fmov w10, s1
146 ; CHECK-GI-NEXT: fmov w11, s3
147 ; CHECK-GI-NEXT: fmov s1, w8
148 ; CHECK-GI-NEXT: fmov w13, s5
149 ; CHECK-GI-NEXT: fmov w8, s4
150 ; CHECK-GI-NEXT: fmov w12, s0
151 ; CHECK-GI-NEXT: sxtb w10, w10
152 ; CHECK-GI-NEXT: sxtb w11, w11
153 ; CHECK-GI-NEXT: sxtb w13, w13
154 ; CHECK-GI-NEXT: sxtb w8, w8
155 ; CHECK-GI-NEXT: sxtb w12, w12
156 ; CHECK-GI-NEXT: mov v1.h[1], w10
157 ; CHECK-GI-NEXT: fmov w10, s6
158 ; CHECK-GI-NEXT: fmov s0, w11
159 ; CHECK-GI-NEXT: fmov s3, w13
160 ; CHECK-GI-NEXT: mov v2.h[1], w12
161 ; CHECK-GI-NEXT: sxtb w10, w10
162 ; CHECK-GI-NEXT: mov v0.h[1], w8
163 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
164 ; CHECK-GI-NEXT: mov v3.h[1], w10
165 ; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
166 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
167 ; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
168 ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0]
169 ; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
170 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v1.4s
171 ; CHECK-GI-NEXT: addv s0, v0.4s
172 ; CHECK-GI-NEXT: fmov w8, s0
173 ; CHECK-GI-NEXT: add w0, w8, w2
176 %0 = load <4 x i8>, ptr %a
177 %1 = sext <4 x i8> %0 to <4 x i32>
178 %2 = load <4 x i8>, ptr %b
179 %3 = sext <4 x i8> %2 to <4 x i32>
180 %4 = mul nsw <4 x i32> %3, %1
181 %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
182 %op.extra = add nsw i32 %5, %sum
; test_sdot_v4i8_double: sum of two signed <4 x i8> dot products, a*b and
; c*d, with the operands passed as vector arguments rather than loaded.
; Both expected sequences sign-extend in-register with shl/sshr #24. The SD
; expectations fuse the two products with mul + mla and reduce once with
; addv; the GI expectations (-global-isel run) keep two separate mul/addv
; chains and add the scalars.
186 define i32 @test_sdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
187 ; CHECK-SD-LABEL: test_sdot_v4i8_double:
188 ; CHECK-SD: // %bb.0: // %entry
189 ; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
190 ; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0
191 ; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
192 ; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
193 ; CHECK-SD-NEXT: shl v2.4s, v2.4s, #24
194 ; CHECK-SD-NEXT: shl v3.4s, v3.4s, #24
195 ; CHECK-SD-NEXT: shl v1.4s, v1.4s, #24
196 ; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24
197 ; CHECK-SD-NEXT: sshr v2.4s, v2.4s, #24
198 ; CHECK-SD-NEXT: sshr v3.4s, v3.4s, #24
199 ; CHECK-SD-NEXT: sshr v1.4s, v1.4s, #24
200 ; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24
201 ; CHECK-SD-NEXT: mul v2.4s, v2.4s, v3.4s
202 ; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s
203 ; CHECK-SD-NEXT: addv s0, v2.4s
204 ; CHECK-SD-NEXT: fmov w0, s0
207 ; CHECK-GI-LABEL: test_sdot_v4i8_double:
208 ; CHECK-GI: // %bb.0: // %entry
209 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
210 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
211 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
212 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
213 ; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24
214 ; CHECK-GI-NEXT: shl v1.4s, v1.4s, #24
215 ; CHECK-GI-NEXT: shl v2.4s, v2.4s, #24
216 ; CHECK-GI-NEXT: shl v3.4s, v3.4s, #24
217 ; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24
218 ; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #24
219 ; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #24
220 ; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #24
221 ; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
222 ; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s
223 ; CHECK-GI-NEXT: addv s0, v0.4s
224 ; CHECK-GI-NEXT: addv s1, v1.4s
225 ; CHECK-GI-NEXT: fmov w8, s0
226 ; CHECK-GI-NEXT: fmov w9, s1
227 ; CHECK-GI-NEXT: add w0, w8, w9
230 %az = sext <4 x i8> %a to <4 x i32>
231 %bz = sext <4 x i8> %b to <4 x i32>
232 %m1 = mul nuw nsw <4 x i32> %az, %bz
233 %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1)
234 %cz = sext <4 x i8> %c to <4 x i32>
235 %dz = sext <4 x i8> %d to <4 x i32>
236 %m2 = mul nuw nsw <4 x i32> %cz, %dz
237 %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2)
238 %x = add i32 %r1, %r2
; test_sdot_v4i8_double_nomla: sum of two signed reductions with no multiply;
; only %a and %c are used by the IR below (%b and %d are unused).
; The SD expectations merge the second sign-extension into the accumulate
; with ssra and reduce once with addv. The GI expectations (-global-isel run)
; sign-extend at 16-bit lane width, reduce each input with saddlv and add the
; two scalars with sxth extension.
242 define i32 @test_sdot_v4i8_double_nomla(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
243 ; CHECK-SD-LABEL: test_sdot_v4i8_double_nomla:
244 ; CHECK-SD: // %bb.0: // %entry
245 ; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
246 ; CHECK-SD-NEXT: ushll v1.4s, v2.4h, #0
247 ; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24
248 ; CHECK-SD-NEXT: shl v1.4s, v1.4s, #24
249 ; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24
250 ; CHECK-SD-NEXT: ssra v0.4s, v1.4s, #24
251 ; CHECK-SD-NEXT: addv s0, v0.4s
252 ; CHECK-SD-NEXT: fmov w0, s0
255 ; CHECK-GI-LABEL: test_sdot_v4i8_double_nomla:
256 ; CHECK-GI: // %bb.0: // %entry
257 ; CHECK-GI-NEXT: shl v1.4h, v2.4h, #8
258 ; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
259 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
260 ; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
261 ; CHECK-GI-NEXT: saddlv s1, v1.4h
262 ; CHECK-GI-NEXT: saddlv s0, v0.4h
263 ; CHECK-GI-NEXT: fmov w8, s1
264 ; CHECK-GI-NEXT: fmov w9, s0
265 ; CHECK-GI-NEXT: sxth w8, w8
266 ; CHECK-GI-NEXT: add w0, w8, w9, sxth
269 %az = sext <4 x i8> %a to <4 x i32>
270 %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %az)
271 %cz = sext <4 x i8> %c to <4 x i32>
272 %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %cz)
273 %x = add i32 %r1, %r2
; test_usdot_v4i8: mixed-sign dot product of two <4 x i8> vectors. The bytes
; from %a are zero-extended and the bytes from %b are sign-extended, then
; multiplied, reduced and added to %sum.
; Neither expected sequence contains a usdot instruction. The SD expectations
; use ushll for one operand, sshll for the other, then smull + addv; the GI
; expectations (-global-isel run) mix uxtb and sxtb per byte before a vector
; mul and addv.
277 define i32 @test_usdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
278 ; CHECK-SD-LABEL: test_usdot_v4i8:
279 ; CHECK-SD: // %bb.0: // %entry
280 ; CHECK-SD-NEXT: ldr s0, [x0]
281 ; CHECK-SD-NEXT: ldr s1, [x1]
282 ; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
283 ; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
284 ; CHECK-SD-NEXT: smull v0.4s, v1.4h, v0.4h
285 ; CHECK-SD-NEXT: addv s0, v0.4s
286 ; CHECK-SD-NEXT: fmov w8, s0
287 ; CHECK-SD-NEXT: add w0, w8, w2
290 ; CHECK-GI-LABEL: test_usdot_v4i8:
291 ; CHECK-GI: // %bb.0: // %entry
292 ; CHECK-GI-NEXT: ldr w8, [x0]
293 ; CHECK-GI-NEXT: ldr w9, [x1]
294 ; CHECK-GI-NEXT: fmov s0, w8
295 ; CHECK-GI-NEXT: fmov s2, w9
296 ; CHECK-GI-NEXT: uxtb w8, w8
297 ; CHECK-GI-NEXT: sxtb w9, w9
298 ; CHECK-GI-NEXT: mov b1, v0.b[1]
299 ; CHECK-GI-NEXT: mov b3, v0.b[2]
300 ; CHECK-GI-NEXT: mov b5, v2.b[2]
301 ; CHECK-GI-NEXT: mov b4, v0.b[3]
302 ; CHECK-GI-NEXT: mov b0, v2.b[1]
303 ; CHECK-GI-NEXT: mov b6, v2.b[3]
304 ; CHECK-GI-NEXT: fmov s2, w9
305 ; CHECK-GI-NEXT: fmov w10, s1
306 ; CHECK-GI-NEXT: fmov w11, s3
307 ; CHECK-GI-NEXT: fmov s1, w8
308 ; CHECK-GI-NEXT: fmov w13, s5
309 ; CHECK-GI-NEXT: fmov w8, s4
310 ; CHECK-GI-NEXT: fmov w12, s0
311 ; CHECK-GI-NEXT: uxtb w10, w10
312 ; CHECK-GI-NEXT: uxtb w11, w11
313 ; CHECK-GI-NEXT: sxtb w13, w13
314 ; CHECK-GI-NEXT: uxtb w8, w8
315 ; CHECK-GI-NEXT: sxtb w12, w12
316 ; CHECK-GI-NEXT: mov v1.h[1], w10
317 ; CHECK-GI-NEXT: fmov w10, s6
318 ; CHECK-GI-NEXT: fmov s0, w11
319 ; CHECK-GI-NEXT: fmov s3, w13
320 ; CHECK-GI-NEXT: mov v2.h[1], w12
321 ; CHECK-GI-NEXT: sxtb w10, w10
322 ; CHECK-GI-NEXT: mov v0.h[1], w8
323 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
324 ; CHECK-GI-NEXT: mov v3.h[1], w10
325 ; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
326 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
327 ; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
328 ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0]
329 ; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
330 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v1.4s
331 ; CHECK-GI-NEXT: addv s0, v0.4s
332 ; CHECK-GI-NEXT: fmov w8, s0
333 ; CHECK-GI-NEXT: add w0, w8, w2
336 %0 = load <4 x i8>, ptr %a
337 %1 = zext <4 x i8> %0 to <4 x i32>
338 %2 = load <4 x i8>, ptr %b
339 %3 = sext <4 x i8> %2 to <4 x i32>
340 %4 = mul nsw <4 x i32> %3, %1
341 %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
342 %op.extra = add nsw i32 %5, %sum
; test_usdot_v4i8_double: sum of two mixed-sign <4 x i8> dot products passed
; as vector arguments; %a and %c are zero-extended, %b and %d sign-extended.
; The SD expectations clear the high byte of the unsigned operands with bic,
; sign-extend the others with shl/sshr #24, then use mul + mla and one addv.
; The GI expectations (-global-isel run) mask with a movi constant and `and`,
; and keep two separate mul/addv chains joined by a scalar add.
346 define i32 @test_usdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
347 ; CHECK-SD-LABEL: test_usdot_v4i8_double:
348 ; CHECK-SD: // %bb.0: // %entry
349 ; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
350 ; CHECK-SD-NEXT: bic v2.4h, #255, lsl #8
351 ; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
352 ; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
353 ; CHECK-SD-NEXT: shl v3.4s, v3.4s, #24
354 ; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0
355 ; CHECK-SD-NEXT: shl v1.4s, v1.4s, #24
356 ; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
357 ; CHECK-SD-NEXT: sshr v3.4s, v3.4s, #24
358 ; CHECK-SD-NEXT: sshr v1.4s, v1.4s, #24
359 ; CHECK-SD-NEXT: mul v2.4s, v2.4s, v3.4s
360 ; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s
361 ; CHECK-SD-NEXT: addv s0, v2.4s
362 ; CHECK-SD-NEXT: fmov w0, s0
365 ; CHECK-GI-LABEL: test_usdot_v4i8_double:
366 ; CHECK-GI: // %bb.0: // %entry
367 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
368 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
369 ; CHECK-GI-NEXT: movi v4.2d, #0x0000ff000000ff
370 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
371 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
372 ; CHECK-GI-NEXT: shl v1.4s, v1.4s, #24
373 ; CHECK-GI-NEXT: shl v3.4s, v3.4s, #24
374 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b
375 ; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b
376 ; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #24
377 ; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #24
378 ; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
379 ; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s
380 ; CHECK-GI-NEXT: addv s0, v0.4s
381 ; CHECK-GI-NEXT: addv s1, v1.4s
382 ; CHECK-GI-NEXT: fmov w8, s0
383 ; CHECK-GI-NEXT: fmov w9, s1
384 ; CHECK-GI-NEXT: add w0, w8, w9
387 %az = zext <4 x i8> %a to <4 x i32>
388 %bz = sext <4 x i8> %b to <4 x i32>
389 %m1 = mul nuw nsw <4 x i32> %az, %bz
390 %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1)
391 %cz = zext <4 x i8> %c to <4 x i32>
392 %dz = sext <4 x i8> %d to <4 x i32>
393 %m2 = mul nuw nsw <4 x i32> %cz, %dz
394 %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2)
395 %x = add i32 %r1, %r2
; test_udot_v5i8: unsigned dot product of two <5 x i8> vectors loaded from %a
; and %b, accumulated with %sum. Five elements is not a multiple of four, and
; neither expected sequence contains a udot instruction.
; The SD expectations multiply with an 8-lane umull, insert the lane taken
; from the upper half into a zeroed vector, add the low four lanes with uaddw
; and reduce with addv. The GI expectations (-global-isel run) load every
; byte with ldrb, multiply the fifth pair in scalar registers, and fold the
; remaining four pairs in with mla.
399 define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
400 ; CHECK-SD-LABEL: test_udot_v5i8:
401 ; CHECK-SD: // %bb.0: // %entry
402 ; CHECK-SD-NEXT: ldr d0, [x0]
403 ; CHECK-SD-NEXT: ldr d1, [x1]
404 ; CHECK-SD-NEXT: umull v0.8h, v1.8b, v0.8b
405 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
406 ; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
407 ; CHECK-SD-NEXT: mov v1.s[0], v2.s[0]
408 ; CHECK-SD-NEXT: uaddw v0.4s, v1.4s, v0.4h
409 ; CHECK-SD-NEXT: addv s0, v0.4s
410 ; CHECK-SD-NEXT: fmov w8, s0
411 ; CHECK-SD-NEXT: add w0, w8, w2
414 ; CHECK-GI-LABEL: test_udot_v5i8:
415 ; CHECK-GI: // %bb.0: // %entry
416 ; CHECK-GI-NEXT: ldrb w8, [x0, #4]
417 ; CHECK-GI-NEXT: ldrb w9, [x1, #4]
418 ; CHECK-GI-NEXT: ldrb w10, [x1]
419 ; CHECK-GI-NEXT: mul w8, w9, w8
420 ; CHECK-GI-NEXT: ldrb w9, [x0]
421 ; CHECK-GI-NEXT: mov v0.s[0], w10
422 ; CHECK-GI-NEXT: mov v1.s[0], w9
423 ; CHECK-GI-NEXT: ldrb w9, [x1, #1]
424 ; CHECK-GI-NEXT: mov v2.s[0], w8
425 ; CHECK-GI-NEXT: ldrb w8, [x0, #1]
426 ; CHECK-GI-NEXT: mov v0.s[1], w9
427 ; CHECK-GI-NEXT: ldrb w9, [x1, #2]
428 ; CHECK-GI-NEXT: mov v1.s[1], w8
429 ; CHECK-GI-NEXT: ldrb w8, [x0, #2]
430 ; CHECK-GI-NEXT: mov v2.s[1], wzr
431 ; CHECK-GI-NEXT: mov v0.s[2], w9
432 ; CHECK-GI-NEXT: ldrb w9, [x1, #3]
433 ; CHECK-GI-NEXT: mov v1.s[2], w8
434 ; CHECK-GI-NEXT: ldrb w8, [x0, #3]
435 ; CHECK-GI-NEXT: mov v2.s[2], wzr
436 ; CHECK-GI-NEXT: mov v0.s[3], w9
437 ; CHECK-GI-NEXT: mov v1.s[3], w8
438 ; CHECK-GI-NEXT: mov v2.s[3], wzr
439 ; CHECK-GI-NEXT: mla v2.4s, v0.4s, v1.4s
440 ; CHECK-GI-NEXT: addv s0, v2.4s
441 ; CHECK-GI-NEXT: fmov w8, s0
442 ; CHECK-GI-NEXT: add w0, w8, w2
445 %0 = load <5 x i8>, ptr %a
446 %1 = zext <5 x i8> %0 to <5 x i32>
447 %2 = load <5 x i8>, ptr %b
448 %3 = zext <5 x i8> %2 to <5 x i32>
449 %4 = mul nuw nsw <5 x i32> %3, %1
450 %5 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %4)
451 %op.extra = add i32 %5, %sum
; test_udot_v5i8_nomla: plain unsigned sum of the five bytes loaded from %a1
; (zext -> reduce.add), with no multiply.
; The SD expectations widen with ushll, place the lane taken from the upper
; half into a zeroed vector, add the low four lanes with uaddw and reduce
; with addv. The GI expectations (-global-isel run) load the bytes one by one
; with ldrb into two 4-lane vectors (the second padded with wzr), add them
; and reduce with addv.
455 define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
456 ; CHECK-SD-LABEL: test_udot_v5i8_nomla:
457 ; CHECK-SD: // %bb.0: // %entry
458 ; CHECK-SD-NEXT: ldr d0, [x0]
459 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
460 ; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
461 ; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
462 ; CHECK-SD-NEXT: mov v1.s[0], v2.s[0]
463 ; CHECK-SD-NEXT: uaddw v0.4s, v1.4s, v0.4h
464 ; CHECK-SD-NEXT: addv s0, v0.4s
465 ; CHECK-SD-NEXT: fmov w0, s0
468 ; CHECK-GI-LABEL: test_udot_v5i8_nomla:
469 ; CHECK-GI: // %bb.0: // %entry
470 ; CHECK-GI-NEXT: ldrb w8, [x0]
471 ; CHECK-GI-NEXT: ldrb w9, [x0, #4]
472 ; CHECK-GI-NEXT: mov v0.s[0], w8
473 ; CHECK-GI-NEXT: mov v1.s[0], w9
474 ; CHECK-GI-NEXT: ldrb w8, [x0, #1]
475 ; CHECK-GI-NEXT: mov v0.s[1], w8
476 ; CHECK-GI-NEXT: mov v1.s[1], wzr
477 ; CHECK-GI-NEXT: ldrb w8, [x0, #2]
478 ; CHECK-GI-NEXT: mov v0.s[2], w8
479 ; CHECK-GI-NEXT: mov v1.s[2], wzr
480 ; CHECK-GI-NEXT: ldrb w8, [x0, #3]
481 ; CHECK-GI-NEXT: mov v0.s[3], w8
482 ; CHECK-GI-NEXT: mov v1.s[3], wzr
483 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
484 ; CHECK-GI-NEXT: addv s0, v0.4s
485 ; CHECK-GI-NEXT: fmov w0, s0
488 %0 = load <5 x i8>, ptr %a1
489 %1 = zext <5 x i8> %0 to <5 x i32>
490 %2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %1)
; test_sdot_v5i8: signed dot product of two <5 x i8> vectors loaded from %a
; and %b, accumulated with %sum. Neither expected sequence contains an sdot
; instruction.
; The SD expectations mirror the unsigned v5i8 shape with signed forms
; (smull, sshll2, saddw) followed by addv. The GI expectations (-global-isel
; run) load each byte with ldrsb, multiply the fifth pair in scalar
; registers, and fold the remaining four pairs in with mla.
493 define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
494 ; CHECK-SD-LABEL: test_sdot_v5i8:
495 ; CHECK-SD: // %bb.0: // %entry
496 ; CHECK-SD-NEXT: ldr d0, [x0]
497 ; CHECK-SD-NEXT: ldr d1, [x1]
498 ; CHECK-SD-NEXT: smull v0.8h, v1.8b, v0.8b
499 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
500 ; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0
501 ; CHECK-SD-NEXT: mov v1.s[0], v2.s[0]
502 ; CHECK-SD-NEXT: saddw v0.4s, v1.4s, v0.4h
503 ; CHECK-SD-NEXT: addv s0, v0.4s
504 ; CHECK-SD-NEXT: fmov w8, s0
505 ; CHECK-SD-NEXT: add w0, w8, w2
508 ; CHECK-GI-LABEL: test_sdot_v5i8:
509 ; CHECK-GI: // %bb.0: // %entry
510 ; CHECK-GI-NEXT: ldrsb w8, [x0, #4]
511 ; CHECK-GI-NEXT: ldrsb w9, [x1, #4]
512 ; CHECK-GI-NEXT: ldrsb w10, [x1]
513 ; CHECK-GI-NEXT: mul w8, w9, w8
514 ; CHECK-GI-NEXT: ldrsb w9, [x0]
515 ; CHECK-GI-NEXT: mov v0.s[0], w10
516 ; CHECK-GI-NEXT: mov v1.s[0], w9
517 ; CHECK-GI-NEXT: ldrsb w9, [x1, #1]
518 ; CHECK-GI-NEXT: mov v2.s[0], w8
519 ; CHECK-GI-NEXT: ldrsb w8, [x0, #1]
520 ; CHECK-GI-NEXT: mov v0.s[1], w9
521 ; CHECK-GI-NEXT: ldrsb w9, [x1, #2]
522 ; CHECK-GI-NEXT: mov v1.s[1], w8
523 ; CHECK-GI-NEXT: ldrsb w8, [x0, #2]
524 ; CHECK-GI-NEXT: mov v2.s[1], wzr
525 ; CHECK-GI-NEXT: mov v0.s[2], w9
526 ; CHECK-GI-NEXT: ldrsb w9, [x1, #3]
527 ; CHECK-GI-NEXT: mov v1.s[2], w8
528 ; CHECK-GI-NEXT: ldrsb w8, [x0, #3]
529 ; CHECK-GI-NEXT: mov v2.s[2], wzr
530 ; CHECK-GI-NEXT: mov v0.s[3], w9
531 ; CHECK-GI-NEXT: mov v1.s[3], w8
532 ; CHECK-GI-NEXT: mov v2.s[3], wzr
533 ; CHECK-GI-NEXT: mla v2.4s, v0.4s, v1.4s
534 ; CHECK-GI-NEXT: addv s0, v2.4s
535 ; CHECK-GI-NEXT: fmov w8, s0
536 ; CHECK-GI-NEXT: add w0, w8, w2
539 %0 = load <5 x i8>, ptr %a
540 %1 = sext <5 x i8> %0 to <5 x i32>
541 %2 = load <5 x i8>, ptr %b
542 %3 = sext <5 x i8> %2 to <5 x i32>
543 %4 = mul nsw <5 x i32> %3, %1
544 %5 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %4)
545 %op.extra = add nsw i32 %5, %sum
; test_sdot_v5i8_double: sum of two signed <5 x i8> dot products, a*b and
; c*d, with the operands passed as vector arguments.
; The SD expectations run two smull/sshll2/saddw chains, add the vectors and
; reduce once with addv. The GI expectations (-global-isel run) extract every
; lane with smov, multiply the fifth pair of each product in scalar
; registers, and finish with two mla/addv chains joined by a scalar add.
549 define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
550 ; CHECK-SD-LABEL: test_sdot_v5i8_double:
551 ; CHECK-SD: // %bb.0: // %entry
552 ; CHECK-SD-NEXT: smull v2.8h, v2.8b, v3.8b
553 ; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
554 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
555 ; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
556 ; CHECK-SD-NEXT: sshll2 v4.4s, v0.8h, #0
557 ; CHECK-SD-NEXT: sshll2 v5.4s, v2.8h, #0
558 ; CHECK-SD-NEXT: mov v3.s[0], v4.s[0]
559 ; CHECK-SD-NEXT: mov v1.s[0], v5.s[0]
560 ; CHECK-SD-NEXT: saddw v0.4s, v3.4s, v0.4h
561 ; CHECK-SD-NEXT: saddw v1.4s, v1.4s, v2.4h
562 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
563 ; CHECK-SD-NEXT: addv s0, v0.4s
564 ; CHECK-SD-NEXT: fmov w0, s0
567 ; CHECK-GI-LABEL: test_sdot_v5i8_double:
568 ; CHECK-GI: // %bb.0: // %entry
569 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
570 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
571 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
572 ; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
573 ; CHECK-GI-NEXT: smov w8, v0.b[4]
574 ; CHECK-GI-NEXT: smov w9, v1.b[4]
575 ; CHECK-GI-NEXT: smov w10, v2.b[4]
576 ; CHECK-GI-NEXT: smov w11, v3.b[4]
577 ; CHECK-GI-NEXT: smov w12, v0.b[0]
578 ; CHECK-GI-NEXT: smov w13, v1.b[0]
579 ; CHECK-GI-NEXT: smov w14, v2.b[0]
580 ; CHECK-GI-NEXT: smov w15, v3.b[0]
581 ; CHECK-GI-NEXT: mul w8, w8, w9
582 ; CHECK-GI-NEXT: smov w9, v0.b[1]
583 ; CHECK-GI-NEXT: mul w10, w10, w11
584 ; CHECK-GI-NEXT: smov w11, v1.b[1]
585 ; CHECK-GI-NEXT: mov v4.s[0], w12
586 ; CHECK-GI-NEXT: smov w12, v2.b[1]
587 ; CHECK-GI-NEXT: mov v5.s[0], w13
588 ; CHECK-GI-NEXT: smov w13, v3.b[1]
589 ; CHECK-GI-NEXT: mov v6.s[0], w8
590 ; CHECK-GI-NEXT: mov v7.s[0], w14
591 ; CHECK-GI-NEXT: mov v16.s[0], w15
592 ; CHECK-GI-NEXT: mov v17.s[0], w10
593 ; CHECK-GI-NEXT: smov w8, v0.b[2]
594 ; CHECK-GI-NEXT: smov w10, v1.b[2]
595 ; CHECK-GI-NEXT: smov w14, v2.b[2]
596 ; CHECK-GI-NEXT: smov w15, v3.b[2]
597 ; CHECK-GI-NEXT: mov v4.s[1], w9
598 ; CHECK-GI-NEXT: mov v5.s[1], w11
599 ; CHECK-GI-NEXT: smov w9, v0.b[3]
600 ; CHECK-GI-NEXT: smov w11, v1.b[3]
601 ; CHECK-GI-NEXT: mov v6.s[1], wzr
602 ; CHECK-GI-NEXT: mov v7.s[1], w12
603 ; CHECK-GI-NEXT: mov v16.s[1], w13
604 ; CHECK-GI-NEXT: mov v17.s[1], wzr
605 ; CHECK-GI-NEXT: smov w12, v2.b[3]
606 ; CHECK-GI-NEXT: smov w13, v3.b[3]
607 ; CHECK-GI-NEXT: mov v4.s[2], w8
608 ; CHECK-GI-NEXT: mov v5.s[2], w10
609 ; CHECK-GI-NEXT: mov v6.s[2], wzr
610 ; CHECK-GI-NEXT: mov v7.s[2], w14
611 ; CHECK-GI-NEXT: mov v16.s[2], w15
612 ; CHECK-GI-NEXT: mov v17.s[2], wzr
613 ; CHECK-GI-NEXT: mov v4.s[3], w9
614 ; CHECK-GI-NEXT: mov v5.s[3], w11
615 ; CHECK-GI-NEXT: mov v6.s[3], wzr
616 ; CHECK-GI-NEXT: mov v7.s[3], w12
617 ; CHECK-GI-NEXT: mov v16.s[3], w13
618 ; CHECK-GI-NEXT: mov v17.s[3], wzr
619 ; CHECK-GI-NEXT: mla v6.4s, v4.4s, v5.4s
620 ; CHECK-GI-NEXT: mla v17.4s, v7.4s, v16.4s
621 ; CHECK-GI-NEXT: addv s0, v6.4s
622 ; CHECK-GI-NEXT: addv s1, v17.4s
623 ; CHECK-GI-NEXT: fmov w8, s0
624 ; CHECK-GI-NEXT: fmov w9, s1
625 ; CHECK-GI-NEXT: add w0, w8, w9
628 %az = sext <5 x i8> %a to <5 x i32>
629 %bz = sext <5 x i8> %b to <5 x i32>
630 %m1 = mul nuw nsw <5 x i32> %az, %bz
631 %r1 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %m1)
632 %cz = sext <5 x i8> %c to <5 x i32>
633 %dz = sext <5 x i8> %d to <5 x i32>
634 %m2 = mul nuw nsw <5 x i32> %cz, %dz
635 %r2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %m2)
636 %x = add i32 %r1, %r2
; test_sdot_v5i8_double_nomla: sum of two signed <5 x i8> reductions with no
; multiply; only %a and %c are used by the IR below (%b and %d are unused).
; The SD expectations sign-extend with sshll/sshll2, combine via saddw, add
; the two vectors and reduce once with addv. The GI expectations
; (-global-isel run) extract every lane with smov into wzr-padded 4-lane
; vectors and use two add/addv chains joined by a scalar add.
640 define i32 @test_sdot_v5i8_double_nomla(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
641 ; CHECK-SD-LABEL: test_sdot_v5i8_double_nomla:
642 ; CHECK-SD: // %bb.0: // %entry
643 ; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
644 ; CHECK-SD-NEXT: sshll v1.8h, v2.8b, #0
645 ; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
646 ; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
647 ; CHECK-SD-NEXT: sshll2 v4.4s, v0.8h, #0
648 ; CHECK-SD-NEXT: sshll2 v5.4s, v1.8h, #0
649 ; CHECK-SD-NEXT: mov v3.s[0], v4.s[0]
650 ; CHECK-SD-NEXT: mov v2.s[0], v5.s[0]
651 ; CHECK-SD-NEXT: saddw v0.4s, v3.4s, v0.4h
652 ; CHECK-SD-NEXT: saddw v1.4s, v2.4s, v1.4h
653 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
654 ; CHECK-SD-NEXT: addv s0, v0.4s
655 ; CHECK-SD-NEXT: fmov w0, s0
658 ; CHECK-GI-LABEL: test_sdot_v5i8_double_nomla:
659 ; CHECK-GI: // %bb.0: // %entry
660 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
661 ; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
662 ; CHECK-GI-NEXT: smov w8, v0.b[0]
663 ; CHECK-GI-NEXT: smov w9, v0.b[4]
664 ; CHECK-GI-NEXT: smov w10, v2.b[0]
665 ; CHECK-GI-NEXT: smov w11, v2.b[4]
666 ; CHECK-GI-NEXT: smov w12, v0.b[1]
667 ; CHECK-GI-NEXT: mov v1.s[0], w8
668 ; CHECK-GI-NEXT: smov w8, v2.b[1]
669 ; CHECK-GI-NEXT: mov v3.s[0], w9
670 ; CHECK-GI-NEXT: mov v4.s[0], w10
671 ; CHECK-GI-NEXT: mov v5.s[0], w11
672 ; CHECK-GI-NEXT: smov w9, v0.b[2]
673 ; CHECK-GI-NEXT: smov w10, v2.b[2]
674 ; CHECK-GI-NEXT: smov w11, v2.b[3]
675 ; CHECK-GI-NEXT: mov v1.s[1], w12
676 ; CHECK-GI-NEXT: mov v3.s[1], wzr
677 ; CHECK-GI-NEXT: mov v4.s[1], w8
678 ; CHECK-GI-NEXT: mov v5.s[1], wzr
679 ; CHECK-GI-NEXT: smov w8, v0.b[3]
680 ; CHECK-GI-NEXT: mov v1.s[2], w9
681 ; CHECK-GI-NEXT: mov v3.s[2], wzr
682 ; CHECK-GI-NEXT: mov v4.s[2], w10
683 ; CHECK-GI-NEXT: mov v5.s[2], wzr
684 ; CHECK-GI-NEXT: mov v1.s[3], w8
685 ; CHECK-GI-NEXT: mov v3.s[3], wzr
686 ; CHECK-GI-NEXT: mov v4.s[3], w11
687 ; CHECK-GI-NEXT: mov v5.s[3], wzr
688 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v3.4s
689 ; CHECK-GI-NEXT: add v1.4s, v4.4s, v5.4s
690 ; CHECK-GI-NEXT: addv s0, v0.4s
691 ; CHECK-GI-NEXT: addv s1, v1.4s
692 ; CHECK-GI-NEXT: fmov w8, s0
693 ; CHECK-GI-NEXT: fmov w9, s1
694 ; CHECK-GI-NEXT: add w0, w8, w9
697 %az = sext <5 x i8> %a to <5 x i32>
698 %r1 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %az)
699 %cz = sext <5 x i8> %c to <5 x i32>
700 %r2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %cz)
701 %x = add i32 %r1, %r2
; test_udot_v8i8: unsigned dot product of two <8 x i8> vectors loaded from %a
; and %b, with no extra accumulator argument.
; The expectations use the shared prefix, so both llc runs must produce the
; same code: a zeroed accumulator, one udot on the 64-bit vectors, and a
; pairwise addp to fold the two 32-bit lanes.
705 define i32 @test_udot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
706 ; CHECK-LABEL: test_udot_v8i8:
707 ; CHECK: // %bb.0: // %entry
708 ; CHECK-NEXT: movi v0.2d, #0000000000000000
709 ; CHECK-NEXT: ldr d1, [x0]
710 ; CHECK-NEXT: ldr d2, [x1]
711 ; CHECK-NEXT: udot v0.2s, v2.8b, v1.8b
712 ; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
713 ; CHECK-NEXT: fmov w0, s0
716 %0 = load <8 x i8>, ptr %a
717 %1 = zext <8 x i8> %0 to <8 x i32>
718 %2 = load <8 x i8>, ptr %b
719 %3 = zext <8 x i8> %2 to <8 x i32>
720 %4 = mul nuw nsw <8 x i32> %3, %1
721 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
; test_udot_v8i8_nomla: plain unsigned sum of the eight bytes loaded from %a1
; (zext -> reduce.add), with no multiply.
; Both expected sequences still use udot, with a vector of ones (movi #1) as
; the second operand, followed by addp. The SD and GI expectations differ
; only in instruction order and register assignment.
725 define i32 @test_udot_v8i8_nomla(ptr nocapture readonly %a1) {
726 ; CHECK-SD-LABEL: test_udot_v8i8_nomla:
727 ; CHECK-SD: // %bb.0: // %entry
728 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
729 ; CHECK-SD-NEXT: movi v1.8b, #1
730 ; CHECK-SD-NEXT: ldr d2, [x0]
731 ; CHECK-SD-NEXT: udot v0.2s, v2.8b, v1.8b
732 ; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
733 ; CHECK-SD-NEXT: fmov w0, s0
736 ; CHECK-GI-LABEL: test_udot_v8i8_nomla:
737 ; CHECK-GI: // %bb.0: // %entry
738 ; CHECK-GI-NEXT: movi v0.8b, #1
739 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
740 ; CHECK-GI-NEXT: ldr d2, [x0]
741 ; CHECK-GI-NEXT: udot v1.2s, v2.8b, v0.8b
742 ; CHECK-GI-NEXT: addp v0.2s, v1.2s, v1.2s
743 ; CHECK-GI-NEXT: fmov w0, s0
746 %0 = load <8 x i8>, ptr %a1
747 %1 = zext <8 x i8> %0 to <8 x i32>
748 %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
; test_sdot_v8i8: signed dot product of two <8 x i8> vectors loaded from %a
; and %b, with no extra accumulator argument.
; The expectations use the shared prefix, so both llc runs must produce the
; same code: a zeroed accumulator, one sdot on the 64-bit vectors, and a
; pairwise addp to fold the two 32-bit lanes.
752 define i32 @test_sdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
753 ; CHECK-LABEL: test_sdot_v8i8:
754 ; CHECK: // %bb.0: // %entry
755 ; CHECK-NEXT: movi v0.2d, #0000000000000000
756 ; CHECK-NEXT: ldr d1, [x0]
757 ; CHECK-NEXT: ldr d2, [x1]
758 ; CHECK-NEXT: sdot v0.2s, v2.8b, v1.8b
759 ; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
760 ; CHECK-NEXT: fmov w0, s0
763 %0 = load <8 x i8>, ptr %a
764 %1 = sext <8 x i8> %0 to <8 x i32>
765 %2 = load <8 x i8>, ptr %b
766 %3 = sext <8 x i8> %2 to <8 x i32>
767 %4 = mul nsw <8 x i32> %3, %1
768 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
; test_sdot_v8i8_nomla: plain signed sum of the eight bytes loaded from %a1
; (sext -> reduce.add), with no multiply.
; Both expected sequences still use sdot, with a vector of ones (movi #1) as
; the second operand, followed by addp. The SD and GI expectations differ
; only in instruction order and register assignment.
772 define i32 @test_sdot_v8i8_nomla(ptr nocapture readonly %a1) {
773 ; CHECK-SD-LABEL: test_sdot_v8i8_nomla:
774 ; CHECK-SD: // %bb.0: // %entry
775 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
776 ; CHECK-SD-NEXT: movi v1.8b, #1
777 ; CHECK-SD-NEXT: ldr d2, [x0]
778 ; CHECK-SD-NEXT: sdot v0.2s, v2.8b, v1.8b
779 ; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
780 ; CHECK-SD-NEXT: fmov w0, s0
783 ; CHECK-GI-LABEL: test_sdot_v8i8_nomla:
784 ; CHECK-GI: // %bb.0: // %entry
785 ; CHECK-GI-NEXT: movi v0.8b, #1
786 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
787 ; CHECK-GI-NEXT: ldr d2, [x0]
788 ; CHECK-GI-NEXT: sdot v1.2s, v2.8b, v0.8b
789 ; CHECK-GI-NEXT: addp v0.2s, v1.2s, v1.2s
790 ; CHECK-GI-NEXT: fmov w0, s0
793 %0 = load <8 x i8>, ptr %a1
794 %1 = sext <8 x i8> %0 to <8 x i32>
795 %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
; test_usdot_v8i8: mixed-sign dot product of two <8 x i8> vectors; the bytes
; from %a are zero-extended and the bytes from %b are sign-extended.
; The SD expectations use a single usdot, with the %a vector as the first
; source operand, followed by addp. The GI expectations (-global-isel run)
; contain no usdot: they widen both inputs to 32-bit lanes and use mul + mla
; followed by addv.
799 define i32 @test_usdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
800 ; CHECK-SD-LABEL: test_usdot_v8i8:
801 ; CHECK-SD: // %bb.0: // %entry
802 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
803 ; CHECK-SD-NEXT: ldr d1, [x0]
804 ; CHECK-SD-NEXT: ldr d2, [x1]
805 ; CHECK-SD-NEXT: usdot v0.2s, v1.8b, v2.8b
806 ; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
807 ; CHECK-SD-NEXT: fmov w0, s0
810 ; CHECK-GI-LABEL: test_usdot_v8i8:
811 ; CHECK-GI: // %bb.0: // %entry
812 ; CHECK-GI-NEXT: ldr d0, [x0]
813 ; CHECK-GI-NEXT: ldr d1, [x1]
814 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
815 ; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
816 ; CHECK-GI-NEXT: ushll2 v2.4s, v0.8h, #0
817 ; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0
818 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
819 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
820 ; CHECK-GI-NEXT: mul v2.4s, v3.4s, v2.4s
821 ; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.4s
822 ; CHECK-GI-NEXT: addv s0, v2.4s
823 ; CHECK-GI-NEXT: fmov w0, s0
826 %0 = load <8 x i8>, ptr %a
827 %1 = zext <8 x i8> %0 to <8 x i32>
828 %2 = load <8 x i8>, ptr %b
829 %3 = sext <8 x i8> %2 to <8 x i32>
830 %4 = mul nsw <8 x i32> %3, %1
831 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
; test_usdot_swapped_operands_v8i8: the same mixed-sign <8 x i8> dot product
; with the extensions swapped; here %a is sign-extended and %b is
; zero-extended.
; The SD expectations still use a single usdot, now with the %b vector as the
; first source operand, followed by addp. The GI expectations (-global-isel
; run) contain no usdot: they widen both inputs and use mul + mla followed by
; addv.
835 define i32 @test_usdot_swapped_operands_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
836 ; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8:
837 ; CHECK-SD: // %bb.0: // %entry
838 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
839 ; CHECK-SD-NEXT: ldr d1, [x0]
840 ; CHECK-SD-NEXT: ldr d2, [x1]
841 ; CHECK-SD-NEXT: usdot v0.2s, v2.8b, v1.8b
842 ; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
843 ; CHECK-SD-NEXT: fmov w0, s0
846 ; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8:
847 ; CHECK-GI: // %bb.0: // %entry
848 ; CHECK-GI-NEXT: ldr d0, [x0]
849 ; CHECK-GI-NEXT: ldr d1, [x1]
850 ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
851 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
852 ; CHECK-GI-NEXT: sshll2 v2.4s, v0.8h, #0
853 ; CHECK-GI-NEXT: ushll2 v3.4s, v1.8h, #0
854 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
855 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
856 ; CHECK-GI-NEXT: mul v2.4s, v3.4s, v2.4s
857 ; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.4s
858 ; CHECK-GI-NEXT: addv s0, v2.4s
859 ; CHECK-GI-NEXT: fmov w0, s0
862 %0 = load <8 x i8>, ptr %a
863 %1 = sext <8 x i8> %0 to <8 x i32>
864 %2 = load <8 x i8>, ptr %b
865 %3 = zext <8 x i8> %2 to <8 x i32>
866 %4 = mul nsw <8 x i32> %3, %1
867 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
; test_udot_v16i8: unsigned dot product of two <16 x i8> vectors loaded from
; %a and %b, accumulated with %sum.
; The expectations use the shared prefix, so both llc runs must produce the
; same code: a zeroed accumulator, one udot on the 128-bit vectors, an addv
; reduction and a scalar add of w2.
871 define i32 @test_udot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
872 ; CHECK-LABEL: test_udot_v16i8:
873 ; CHECK: // %bb.0: // %entry
874 ; CHECK-NEXT: movi v0.2d, #0000000000000000
875 ; CHECK-NEXT: ldr q1, [x0]
876 ; CHECK-NEXT: ldr q2, [x1]
877 ; CHECK-NEXT: udot v0.4s, v2.16b, v1.16b
878 ; CHECK-NEXT: addv s0, v0.4s
879 ; CHECK-NEXT: fmov w8, s0
880 ; CHECK-NEXT: add w0, w8, w2
883 %0 = load <16 x i8>, ptr %a
884 %1 = zext <16 x i8> %0 to <16 x i32>
885 %2 = load <16 x i8>, ptr %b
886 %3 = zext <16 x i8> %2 to <16 x i32>
887 %4 = mul nuw nsw <16 x i32> %3, %1
888 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
889 %op.extra = add i32 %5, %sum
; test_udot_v16i8_nomla: plain unsigned sum of the sixteen bytes loaded from
; %a1 (zext -> reduce.add), with no multiply.
; The expectations use the shared prefix, so both llc runs must produce the
; same code: udot against a vector of ones (movi #1) into a zeroed
; accumulator, followed by addv.
893 define i32 @test_udot_v16i8_nomla(ptr nocapture readonly %a1) {
894 ; CHECK-LABEL: test_udot_v16i8_nomla:
895 ; CHECK: // %bb.0: // %entry
896 ; CHECK-NEXT: movi v0.16b, #1
897 ; CHECK-NEXT: movi v1.2d, #0000000000000000
898 ; CHECK-NEXT: ldr q2, [x0]
899 ; CHECK-NEXT: udot v1.4s, v2.16b, v0.16b
900 ; CHECK-NEXT: addv s0, v1.4s
901 ; CHECK-NEXT: fmov w0, s0
904 %0 = load <16 x i8>, ptr %a1
905 %1 = zext <16 x i8> %0 to <16 x i32>
906 %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
; Signed dot product of two <16 x i8> vectors loaded from %a and %b: both loads
; are sign-extended to <16 x i32>, multiplied, add-reduced, and %sum is added
; to the scalar result.
; Both llc runs share one expected sequence: a single sdot into a zeroed
; accumulator, an addv across lanes, then the scalar add of w2.
910 define i32 @test_sdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
911 ; CHECK-LABEL: test_sdot_v16i8:
912 ; CHECK: // %bb.0: // %entry
913 ; CHECK-NEXT: movi v0.2d, #0000000000000000
914 ; CHECK-NEXT: ldr q1, [x0]
915 ; CHECK-NEXT: ldr q2, [x1]
916 ; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
917 ; CHECK-NEXT: addv s0, v0.4s
918 ; CHECK-NEXT: fmov w8, s0
919 ; CHECK-NEXT: add w0, w8, w2
922 %0 = load <16 x i8>, ptr %a
923 %1 = sext <16 x i8> %0 to <16 x i32>
924 %2 = load <16 x i8>, ptr %b
925 %3 = sext <16 x i8> %2 to <16 x i32>
926 %4 = mul nsw <16 x i32> %3, %1
927 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
928 %op.extra = add nsw i32 %5, %sum
; Add-reduction of a sign-extended <16 x i8> load with no multiply in the IR.
; Both llc runs expect the sum to be formed with sdot against an all-ones byte
; vector (movi #1) into a zeroed accumulator, followed by addv.
932 define i32 @test_sdot_v16i8_nomla(ptr nocapture readonly %a1) {
933 ; CHECK-LABEL: test_sdot_v16i8_nomla:
934 ; CHECK: // %bb.0: // %entry
935 ; CHECK-NEXT: movi v0.16b, #1
936 ; CHECK-NEXT: movi v1.2d, #0000000000000000
937 ; CHECK-NEXT: ldr q2, [x0]
938 ; CHECK-NEXT: sdot v1.4s, v2.16b, v0.16b
939 ; CHECK-NEXT: addv s0, v1.4s
940 ; CHECK-NEXT: fmov w0, s0
943 %0 = load <16 x i8>, ptr %a1
944 %1 = sext <16 x i8> %0 to <16 x i32>
945 %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
; Mixed-sign dot product of two <16 x i8> loads: %a is zero-extended and %b is
; sign-extended before the multiply, add-reduction, and final add of %sum.
; The default llc run expects a single usdot whose first source is the vector
; loaded from x0 (the zero-extended one). The -global-isel run expects no
; usdot: the inputs are widened with ushll/sshll and combined with mul/mla
; before the addv.
949 define i32 @test_usdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
950 ; CHECK-SD-LABEL: test_usdot_v16i8:
951 ; CHECK-SD: // %bb.0: // %entry
952 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
953 ; CHECK-SD-NEXT: ldr q1, [x0]
954 ; CHECK-SD-NEXT: ldr q2, [x1]
955 ; CHECK-SD-NEXT: usdot v0.4s, v1.16b, v2.16b
956 ; CHECK-SD-NEXT: addv s0, v0.4s
957 ; CHECK-SD-NEXT: fmov w8, s0
958 ; CHECK-SD-NEXT: add w0, w8, w2
961 ; CHECK-GI-LABEL: test_usdot_v16i8:
962 ; CHECK-GI: // %bb.0: // %entry
963 ; CHECK-GI-NEXT: ldr q0, [x0]
964 ; CHECK-GI-NEXT: ldr q1, [x1]
965 ; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0
966 ; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
967 ; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0
968 ; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0
969 ; CHECK-GI-NEXT: ushll2 v4.4s, v2.8h, #0
970 ; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0
971 ; CHECK-GI-NEXT: sshll2 v6.4s, v3.8h, #0
972 ; CHECK-GI-NEXT: sshll2 v7.4s, v1.8h, #0
973 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
974 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
975 ; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
976 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
977 ; CHECK-GI-NEXT: mul v4.4s, v6.4s, v4.4s
978 ; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s
979 ; CHECK-GI-NEXT: mla v4.4s, v3.4s, v2.4s
980 ; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.4s
981 ; CHECK-GI-NEXT: add v0.4s, v4.4s, v5.4s
982 ; CHECK-GI-NEXT: addv s0, v0.4s
983 ; CHECK-GI-NEXT: fmov w8, s0
984 ; CHECK-GI-NEXT: add w0, w8, w2
987 %0 = load <16 x i8>, ptr %a
988 %1 = zext <16 x i8> %0 to <16 x i32>
989 %2 = load <16 x i8>, ptr %b
990 %3 = sext <16 x i8> %2 to <16 x i32>
991 %4 = mul nsw <16 x i32> %3, %1
992 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
993 %op.extra = add nsw i32 %5, %sum
; Mixed-sign dot product with the extension kinds swapped relative to
; test_usdot_v16i8: here %a is sign-extended and %b is zero-extended.
; The default llc run still expects a single usdot, now with the vector loaded
; from x1 (the zero-extended one) as the first source. The -global-isel run
; expects the widened sshll/ushll + mul/mla expansion instead of usdot.
997 define i32 @test_usdot_swapped_operands_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
998 ; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8:
999 ; CHECK-SD: // %bb.0: // %entry
1000 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
1001 ; CHECK-SD-NEXT: ldr q1, [x0]
1002 ; CHECK-SD-NEXT: ldr q2, [x1]
1003 ; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v1.16b
1004 ; CHECK-SD-NEXT: addv s0, v0.4s
1005 ; CHECK-SD-NEXT: fmov w8, s0
1006 ; CHECK-SD-NEXT: add w0, w8, w2
1007 ; CHECK-SD-NEXT: ret
1009 ; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8:
1010 ; CHECK-GI: // %bb.0: // %entry
1011 ; CHECK-GI-NEXT: ldr q0, [x0]
1012 ; CHECK-GI-NEXT: ldr q1, [x1]
1013 ; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0
1014 ; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0
1015 ; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0
1016 ; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
1017 ; CHECK-GI-NEXT: sshll2 v4.4s, v2.8h, #0
1018 ; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0
1019 ; CHECK-GI-NEXT: ushll2 v6.4s, v3.8h, #0
1020 ; CHECK-GI-NEXT: ushll2 v7.4s, v1.8h, #0
1021 ; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
1022 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
1023 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
1024 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
1025 ; CHECK-GI-NEXT: mul v4.4s, v6.4s, v4.4s
1026 ; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s
1027 ; CHECK-GI-NEXT: mla v4.4s, v3.4s, v2.4s
1028 ; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.4s
1029 ; CHECK-GI-NEXT: add v0.4s, v4.4s, v5.4s
1030 ; CHECK-GI-NEXT: addv s0, v0.4s
1031 ; CHECK-GI-NEXT: fmov w8, s0
1032 ; CHECK-GI-NEXT: add w0, w8, w2
1033 ; CHECK-GI-NEXT: ret
1035 %0 = load <16 x i8>, ptr %a
1036 %1 = sext <16 x i8> %0 to <16 x i32>
1037 %2 = load <16 x i8>, ptr %b
1038 %3 = zext <16 x i8> %2 to <16 x i32>
1039 %4 = mul nsw <16 x i32> %3, %1
1040 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
1041 %op.extra = add nsw i32 %5, %sum
; Sum of two independent unsigned <8 x i8> dot products passed by value:
; reduce(zext(a) * zext(b)) + reduce(zext(c) * zext(d)).
; The default llc run expects both udot instructions to accumulate into the
; same register (v4) so only one addp is needed. The -global-isel run expects
; two separate accumulators, each reduced with addp, then a scalar add.
1045 define i32 @test_udot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1046 ; CHECK-SD-LABEL: test_udot_v8i8_double:
1047 ; CHECK-SD: // %bb.0: // %entry
1048 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1049 ; CHECK-SD-NEXT: udot v4.2s, v2.8b, v3.8b
1050 ; CHECK-SD-NEXT: udot v4.2s, v0.8b, v1.8b
1051 ; CHECK-SD-NEXT: addp v0.2s, v4.2s, v4.2s
1052 ; CHECK-SD-NEXT: fmov w0, s0
1053 ; CHECK-SD-NEXT: ret
1055 ; CHECK-GI-LABEL: test_udot_v8i8_double:
1056 ; CHECK-GI: // %bb.0: // %entry
1057 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
1058 ; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
1059 ; CHECK-GI-NEXT: udot v5.2s, v0.8b, v1.8b
1060 ; CHECK-GI-NEXT: udot v4.2s, v2.8b, v3.8b
1061 ; CHECK-GI-NEXT: addp v0.2s, v5.2s, v5.2s
1062 ; CHECK-GI-NEXT: addp v1.2s, v4.2s, v4.2s
1063 ; CHECK-GI-NEXT: fmov w8, s0
1064 ; CHECK-GI-NEXT: fmov w9, s1
1065 ; CHECK-GI-NEXT: add w0, w8, w9
1066 ; CHECK-GI-NEXT: ret
1068 %az = zext <8 x i8> %a to <8 x i32>
1069 %bz = zext <8 x i8> %b to <8 x i32>
1070 %m1 = mul nuw nsw <8 x i32> %az, %bz
1071 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
1072 %cz = zext <8 x i8> %c to <8 x i32>
1073 %dz = zext <8 x i8> %d to <8 x i32>
1074 %m2 = mul nuw nsw <8 x i32> %cz, %dz
1075 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
1076 %x = add i32 %r1, %r2
; Sum of two add-reductions of zero-extended <8 x i8> arguments with no
; multiply: reduce(zext(a)) + reduce(zext(c)). %b and %d are not used by the
; IR body.
; The default llc run expects two udot instructions against an all-ones vector
; sharing one accumulator (v1) and a single addp. The -global-isel run expects
; separate accumulators, one addp each, then a scalar add.
1080 define i32 @test_udot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1081 ; CHECK-SD-LABEL: test_udot_v8i8_double_nomla:
1082 ; CHECK-SD: // %bb.0: // %entry
1083 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
1084 ; CHECK-SD-NEXT: movi v3.8b, #1
1085 ; CHECK-SD-NEXT: udot v1.2s, v2.8b, v3.8b
1086 ; CHECK-SD-NEXT: udot v1.2s, v0.8b, v3.8b
1087 ; CHECK-SD-NEXT: addp v0.2s, v1.2s, v1.2s
1088 ; CHECK-SD-NEXT: fmov w0, s0
1089 ; CHECK-SD-NEXT: ret
1091 ; CHECK-GI-LABEL: test_udot_v8i8_double_nomla:
1092 ; CHECK-GI: // %bb.0: // %entry
1093 ; CHECK-GI-NEXT: movi v1.8b, #1
1094 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
1095 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
1096 ; CHECK-GI-NEXT: udot v4.2s, v0.8b, v1.8b
1097 ; CHECK-GI-NEXT: udot v3.2s, v2.8b, v1.8b
1098 ; CHECK-GI-NEXT: addp v0.2s, v4.2s, v4.2s
1099 ; CHECK-GI-NEXT: addp v1.2s, v3.2s, v3.2s
1100 ; CHECK-GI-NEXT: fmov w8, s0
1101 ; CHECK-GI-NEXT: fmov w9, s1
1102 ; CHECK-GI-NEXT: add w0, w8, w9
1103 ; CHECK-GI-NEXT: ret
1105 %az = zext <8 x i8> %a to <8 x i32>
1106 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %az)
1107 %cz = zext <8 x i8> %c to <8 x i32>
1108 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %cz)
1109 %x = add i32 %r1, %r2
; Sum of two independent unsigned <16 x i8> dot products passed by value:
; reduce(zext(a) * zext(b)) + reduce(zext(c) * zext(d)).
; The default llc run expects both udot instructions to accumulate into the
; same register (v4) followed by one addv. The -global-isel run expects two
; separate accumulators, one addv each, then a scalar add.
1113 define i32 @test_udot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1114 ; CHECK-SD-LABEL: test_udot_v16i8_double:
1115 ; CHECK-SD: // %bb.0: // %entry
1116 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1117 ; CHECK-SD-NEXT: udot v4.4s, v2.16b, v3.16b
1118 ; CHECK-SD-NEXT: udot v4.4s, v0.16b, v1.16b
1119 ; CHECK-SD-NEXT: addv s0, v4.4s
1120 ; CHECK-SD-NEXT: fmov w0, s0
1121 ; CHECK-SD-NEXT: ret
1123 ; CHECK-GI-LABEL: test_udot_v16i8_double:
1124 ; CHECK-GI: // %bb.0: // %entry
1125 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
1126 ; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
1127 ; CHECK-GI-NEXT: udot v5.4s, v0.16b, v1.16b
1128 ; CHECK-GI-NEXT: udot v4.4s, v2.16b, v3.16b
1129 ; CHECK-GI-NEXT: addv s0, v5.4s
1130 ; CHECK-GI-NEXT: addv s1, v4.4s
1131 ; CHECK-GI-NEXT: fmov w8, s0
1132 ; CHECK-GI-NEXT: fmov w9, s1
1133 ; CHECK-GI-NEXT: add w0, w8, w9
1134 ; CHECK-GI-NEXT: ret
1136 %az = zext <16 x i8> %a to <16 x i32>
1137 %bz = zext <16 x i8> %b to <16 x i32>
1138 %m1 = mul nuw nsw <16 x i32> %az, %bz
1139 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
1140 %cz = zext <16 x i8> %c to <16 x i32>
1141 %dz = zext <16 x i8> %d to <16 x i32>
1142 %m2 = mul nuw nsw <16 x i32> %cz, %dz
1143 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
1144 %x = add i32 %r1, %r2
; Sum of two add-reductions of zero-extended <16 x i8> arguments with no
; multiply: reduce(zext(a)) + reduce(zext(c)). %b and %d are not used by the
; IR body.
; The default llc run expects two udot instructions against an all-ones vector
; sharing one accumulator (v3) and a single addv. The -global-isel run expects
; separate accumulators, one addv each, then a scalar add.
1148 define i32 @test_udot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1149 ; CHECK-SD-LABEL: test_udot_v16i8_double_nomla:
1150 ; CHECK-SD: // %bb.0: // %entry
1151 ; CHECK-SD-NEXT: movi v1.16b, #1
1152 ; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
1153 ; CHECK-SD-NEXT: udot v3.4s, v2.16b, v1.16b
1154 ; CHECK-SD-NEXT: udot v3.4s, v0.16b, v1.16b
1155 ; CHECK-SD-NEXT: addv s0, v3.4s
1156 ; CHECK-SD-NEXT: fmov w0, s0
1157 ; CHECK-SD-NEXT: ret
1159 ; CHECK-GI-LABEL: test_udot_v16i8_double_nomla:
1160 ; CHECK-GI: // %bb.0: // %entry
1161 ; CHECK-GI-NEXT: movi v1.16b, #1
1162 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
1163 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
1164 ; CHECK-GI-NEXT: udot v4.4s, v0.16b, v1.16b
1165 ; CHECK-GI-NEXT: udot v3.4s, v2.16b, v1.16b
1166 ; CHECK-GI-NEXT: addv s0, v4.4s
1167 ; CHECK-GI-NEXT: addv s1, v3.4s
1168 ; CHECK-GI-NEXT: fmov w8, s0
1169 ; CHECK-GI-NEXT: fmov w9, s1
1170 ; CHECK-GI-NEXT: add w0, w8, w9
1171 ; CHECK-GI-NEXT: ret
1173 %az = zext <16 x i8> %a to <16 x i32>
1174 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)
1175 %cz = zext <16 x i8> %c to <16 x i32>
1176 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)
1177 %x = add i32 %r1, %r2
; Sum of two independent signed <8 x i8> dot products passed by value:
; reduce(sext(a) * sext(b)) + reduce(sext(c) * sext(d)).
; The default llc run expects both sdot instructions to accumulate into the
; same register (v4) so only one addp is needed. The -global-isel run expects
; two separate accumulators, each reduced with addp, then a scalar add.
; NOTE(review): the multiplies carry nuw although their operands are
; sign-extended - confirm that flag is intended for this test.
1181 define i32 @test_sdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1182 ; CHECK-SD-LABEL: test_sdot_v8i8_double:
1183 ; CHECK-SD: // %bb.0: // %entry
1184 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1185 ; CHECK-SD-NEXT: sdot v4.2s, v2.8b, v3.8b
1186 ; CHECK-SD-NEXT: sdot v4.2s, v0.8b, v1.8b
1187 ; CHECK-SD-NEXT: addp v0.2s, v4.2s, v4.2s
1188 ; CHECK-SD-NEXT: fmov w0, s0
1189 ; CHECK-SD-NEXT: ret
1191 ; CHECK-GI-LABEL: test_sdot_v8i8_double:
1192 ; CHECK-GI: // %bb.0: // %entry
1193 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
1194 ; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
1195 ; CHECK-GI-NEXT: sdot v5.2s, v0.8b, v1.8b
1196 ; CHECK-GI-NEXT: sdot v4.2s, v2.8b, v3.8b
1197 ; CHECK-GI-NEXT: addp v0.2s, v5.2s, v5.2s
1198 ; CHECK-GI-NEXT: addp v1.2s, v4.2s, v4.2s
1199 ; CHECK-GI-NEXT: fmov w8, s0
1200 ; CHECK-GI-NEXT: fmov w9, s1
1201 ; CHECK-GI-NEXT: add w0, w8, w9
1202 ; CHECK-GI-NEXT: ret
1204 %az = sext <8 x i8> %a to <8 x i32>
1205 %bz = sext <8 x i8> %b to <8 x i32>
1206 %m1 = mul nuw nsw <8 x i32> %az, %bz
1207 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
1208 %cz = sext <8 x i8> %c to <8 x i32>
1209 %dz = sext <8 x i8> %d to <8 x i32>
1210 %m2 = mul nuw nsw <8 x i32> %cz, %dz
1211 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
1212 %x = add i32 %r1, %r2
; Sum of two add-reductions of sign-extended <8 x i8> arguments with no
; multiply: reduce(sext(a)) + reduce(sext(c)). %b and %d are not used by the
; IR body.
; The default llc run expects two sdot instructions against an all-ones vector
; sharing one accumulator (v1) and a single addp. The -global-isel run expects
; separate accumulators, one addp each, then a scalar add.
1216 define i32 @test_sdot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1217 ; CHECK-SD-LABEL: test_sdot_v8i8_double_nomla:
1218 ; CHECK-SD: // %bb.0: // %entry
1219 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
1220 ; CHECK-SD-NEXT: movi v3.8b, #1
1221 ; CHECK-SD-NEXT: sdot v1.2s, v2.8b, v3.8b
1222 ; CHECK-SD-NEXT: sdot v1.2s, v0.8b, v3.8b
1223 ; CHECK-SD-NEXT: addp v0.2s, v1.2s, v1.2s
1224 ; CHECK-SD-NEXT: fmov w0, s0
1225 ; CHECK-SD-NEXT: ret
1227 ; CHECK-GI-LABEL: test_sdot_v8i8_double_nomla:
1228 ; CHECK-GI: // %bb.0: // %entry
1229 ; CHECK-GI-NEXT: movi v1.8b, #1
1230 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
1231 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
1232 ; CHECK-GI-NEXT: sdot v4.2s, v0.8b, v1.8b
1233 ; CHECK-GI-NEXT: sdot v3.2s, v2.8b, v1.8b
1234 ; CHECK-GI-NEXT: addp v0.2s, v4.2s, v4.2s
1235 ; CHECK-GI-NEXT: addp v1.2s, v3.2s, v3.2s
1236 ; CHECK-GI-NEXT: fmov w8, s0
1237 ; CHECK-GI-NEXT: fmov w9, s1
1238 ; CHECK-GI-NEXT: add w0, w8, w9
1239 ; CHECK-GI-NEXT: ret
1241 %az = sext <8 x i8> %a to <8 x i32>
1242 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %az)
1243 %cz = sext <8 x i8> %c to <8 x i32>
1244 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %cz)
1245 %x = add i32 %r1, %r2
; Sum of two independent signed <16 x i8> dot products passed by value:
; reduce(sext(a) * sext(b)) + reduce(sext(c) * sext(d)).
; The default llc run expects both sdot instructions to accumulate into the
; same register (v4) followed by one addv. The -global-isel run expects two
; separate accumulators, one addv each, then a scalar add.
; NOTE(review): the multiplies carry nuw although their operands are
; sign-extended - confirm that flag is intended for this test.
1249 define i32 @test_sdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1250 ; CHECK-SD-LABEL: test_sdot_v16i8_double:
1251 ; CHECK-SD: // %bb.0: // %entry
1252 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1253 ; CHECK-SD-NEXT: sdot v4.4s, v2.16b, v3.16b
1254 ; CHECK-SD-NEXT: sdot v4.4s, v0.16b, v1.16b
1255 ; CHECK-SD-NEXT: addv s0, v4.4s
1256 ; CHECK-SD-NEXT: fmov w0, s0
1257 ; CHECK-SD-NEXT: ret
1259 ; CHECK-GI-LABEL: test_sdot_v16i8_double:
1260 ; CHECK-GI: // %bb.0: // %entry
1261 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
1262 ; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
1263 ; CHECK-GI-NEXT: sdot v5.4s, v0.16b, v1.16b
1264 ; CHECK-GI-NEXT: sdot v4.4s, v2.16b, v3.16b
1265 ; CHECK-GI-NEXT: addv s0, v5.4s
1266 ; CHECK-GI-NEXT: addv s1, v4.4s
1267 ; CHECK-GI-NEXT: fmov w8, s0
1268 ; CHECK-GI-NEXT: fmov w9, s1
1269 ; CHECK-GI-NEXT: add w0, w8, w9
1270 ; CHECK-GI-NEXT: ret
1272 %az = sext <16 x i8> %a to <16 x i32>
1273 %bz = sext <16 x i8> %b to <16 x i32>
1274 %m1 = mul nuw nsw <16 x i32> %az, %bz
1275 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
1276 %cz = sext <16 x i8> %c to <16 x i32>
1277 %dz = sext <16 x i8> %d to <16 x i32>
1278 %m2 = mul nuw nsw <16 x i32> %cz, %dz
1279 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
1280 %x = add i32 %r1, %r2
; Sum of two add-reductions of sign-extended <16 x i8> arguments with no
; multiply: reduce(sext(a)) + reduce(sext(c)). %b and %d are not used by the
; IR body.
; The default llc run expects two sdot instructions against an all-ones vector
; sharing one accumulator (v3) and a single addv. The -global-isel run expects
; separate accumulators, one addv each, then a scalar add.
1284 define i32 @test_sdot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1285 ; CHECK-SD-LABEL: test_sdot_v16i8_double_nomla:
1286 ; CHECK-SD: // %bb.0: // %entry
1287 ; CHECK-SD-NEXT: movi v1.16b, #1
1288 ; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
1289 ; CHECK-SD-NEXT: sdot v3.4s, v2.16b, v1.16b
1290 ; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b
1291 ; CHECK-SD-NEXT: addv s0, v3.4s
1292 ; CHECK-SD-NEXT: fmov w0, s0
1293 ; CHECK-SD-NEXT: ret
1295 ; CHECK-GI-LABEL: test_sdot_v16i8_double_nomla:
1296 ; CHECK-GI: // %bb.0: // %entry
1297 ; CHECK-GI-NEXT: movi v1.16b, #1
1298 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
1299 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
1300 ; CHECK-GI-NEXT: sdot v4.4s, v0.16b, v1.16b
1301 ; CHECK-GI-NEXT: sdot v3.4s, v2.16b, v1.16b
1302 ; CHECK-GI-NEXT: addv s0, v4.4s
1303 ; CHECK-GI-NEXT: addv s1, v3.4s
1304 ; CHECK-GI-NEXT: fmov w8, s0
1305 ; CHECK-GI-NEXT: fmov w9, s1
1306 ; CHECK-GI-NEXT: add w0, w8, w9
1307 ; CHECK-GI-NEXT: ret
1309 %az = sext <16 x i8> %a to <16 x i32>
1310 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)
1311 %cz = sext <16 x i8> %c to <16 x i32>
1312 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)
1313 %x = add i32 %r1, %r2
; Sum of two mixed-sign <8 x i8> dot products passed by value:
; reduce(zext(a) * sext(b)) + reduce(zext(c) * sext(d)).
; The default llc run expects two usdot instructions into separate zeroed
; accumulators, a vector add of the two, and one addp. The -global-isel run
; expects no usdot: the inputs are widened with ushll/sshll, combined with
; mul/mla, reduced with addv per product, then added as scalars.
; NOTE(review): the multiplies carry nuw although one operand of each is
; sign-extended - confirm that flag is intended for this test.
1318 define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1319 ; CHECK-SD-LABEL: test_usdot_v8i8_double:
1320 ; CHECK-SD: // %bb.0: // %entry
1321 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1322 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
1323 ; CHECK-SD-NEXT: usdot v5.2s, v0.8b, v1.8b
1324 ; CHECK-SD-NEXT: usdot v4.2s, v2.8b, v3.8b
1325 ; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
1326 ; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
1327 ; CHECK-SD-NEXT: fmov w0, s0
1328 ; CHECK-SD-NEXT: ret
1330 ; CHECK-GI-LABEL: test_usdot_v8i8_double:
1331 ; CHECK-GI: // %bb.0: // %entry
1332 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
1333 ; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
1334 ; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0
1335 ; CHECK-GI-NEXT: sshll v3.8h, v3.8b, #0
1336 ; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0
1337 ; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0
1338 ; CHECK-GI-NEXT: ushll2 v6.4s, v2.8h, #0
1339 ; CHECK-GI-NEXT: sshll2 v7.4s, v3.8h, #0
1340 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
1341 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
1342 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
1343 ; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
1344 ; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s
1345 ; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s
1346 ; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s
1347 ; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s
1348 ; CHECK-GI-NEXT: addv s0, v4.4s
1349 ; CHECK-GI-NEXT: addv s1, v5.4s
1350 ; CHECK-GI-NEXT: fmov w8, s0
1351 ; CHECK-GI-NEXT: fmov w9, s1
1352 ; CHECK-GI-NEXT: add w0, w8, w9
1353 ; CHECK-GI-NEXT: ret
1355 %az = zext <8 x i8> %a to <8 x i32>
1356 %bz = sext <8 x i8> %b to <8 x i32>
1357 %m1 = mul nuw nsw <8 x i32> %az, %bz
1358 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
1359 %cz = zext <8 x i8> %c to <8 x i32>
1360 %dz = sext <8 x i8> %d to <8 x i32>
1361 %m2 = mul nuw nsw <8 x i32> %cz, %dz
1362 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
1363 %x = add i32 %r1, %r2
; Sum of two mixed-sign <8 x i8> dot products with the extension kinds swapped
; relative to test_usdot_v8i8_double:
; reduce(sext(a) * zext(b)) + reduce(sext(c) * zext(d)).
; The default llc run expects two usdot instructions with the zero-extended
; arguments (v1, v3) as the first source, a vector add, and one addp. The
; -global-isel run expects the widened sshll/ushll + mul/mla expansion, an addv
; per product, then a scalar add.
; NOTE(review): the multiplies carry nuw although one operand of each is
; sign-extended - confirm that flag is intended for this test.
1367 define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1368 ; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double:
1369 ; CHECK-SD: // %bb.0: // %entry
1370 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1371 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
1372 ; CHECK-SD-NEXT: usdot v5.2s, v1.8b, v0.8b
1373 ; CHECK-SD-NEXT: usdot v4.2s, v3.8b, v2.8b
1374 ; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
1375 ; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
1376 ; CHECK-SD-NEXT: fmov w0, s0
1377 ; CHECK-SD-NEXT: ret
1379 ; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8_double:
1380 ; CHECK-GI: // %bb.0: // %entry
1381 ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
1382 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
1383 ; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0
1384 ; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0
1385 ; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0
1386 ; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0
1387 ; CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0
1388 ; CHECK-GI-NEXT: ushll2 v7.4s, v3.8h, #0
1389 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
1390 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
1391 ; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
1392 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
1393 ; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s
1394 ; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s
1395 ; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s
1396 ; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s
1397 ; CHECK-GI-NEXT: addv s0, v4.4s
1398 ; CHECK-GI-NEXT: addv s1, v5.4s
1399 ; CHECK-GI-NEXT: fmov w8, s0
1400 ; CHECK-GI-NEXT: fmov w9, s1
1401 ; CHECK-GI-NEXT: add w0, w8, w9
1402 ; CHECK-GI-NEXT: ret
1404 %az = sext <8 x i8> %a to <8 x i32>
1405 %bz = zext <8 x i8> %b to <8 x i32>
1406 %m1 = mul nuw nsw <8 x i32> %az, %bz
1407 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
1408 %cz = sext <8 x i8> %c to <8 x i32>
1409 %dz = zext <8 x i8> %d to <8 x i32>
1410 %m2 = mul nuw nsw <8 x i32> %cz, %dz
1411 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
1412 %x = add i32 %r1, %r2
; Sum of two mixed-sign <16 x i8> dot products passed by value:
; reduce(zext(a) * sext(b)) + reduce(zext(c) * sext(d)).
; The default llc run expects two usdot instructions into separate zeroed
; accumulators, a vector add of the two, and one addv. The -global-isel run
; expects no usdot: each input is widened in two steps with ushll/sshll (and
; their "2" upper-half forms), combined with mul/mla, pairwise added, reduced
; with addv per product, then added as scalars.
; NOTE(review): the multiplies carry nuw although one operand of each is
; sign-extended - confirm that flag is intended for this test.
1416 define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1417 ; CHECK-SD-LABEL: test_usdot_v16i8_double:
1418 ; CHECK-SD: // %bb.0: // %entry
1419 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1420 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
1421 ; CHECK-SD-NEXT: usdot v5.4s, v0.16b, v1.16b
1422 ; CHECK-SD-NEXT: usdot v4.4s, v2.16b, v3.16b
1423 ; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
1424 ; CHECK-SD-NEXT: addv s0, v0.4s
1425 ; CHECK-SD-NEXT: fmov w0, s0
1426 ; CHECK-SD-NEXT: ret
1428 ; CHECK-GI-LABEL: test_usdot_v16i8_double:
1429 ; CHECK-GI: // %bb.0: // %entry
1430 ; CHECK-GI-NEXT: ushll v4.8h, v0.8b, #0
1431 ; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
1432 ; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0
1433 ; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0
1434 ; CHECK-GI-NEXT: ushll v6.8h, v2.8b, #0
1435 ; CHECK-GI-NEXT: sshll v7.8h, v3.8b, #0
1436 ; CHECK-GI-NEXT: ushll2 v2.8h, v2.16b, #0
1437 ; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0
1438 ; CHECK-GI-NEXT: ushll2 v16.4s, v4.8h, #0
1439 ; CHECK-GI-NEXT: ushll2 v17.4s, v0.8h, #0
1440 ; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0
1441 ; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0
1442 ; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0
1443 ; CHECK-GI-NEXT: sshll2 v21.4s, v7.8h, #0
1444 ; CHECK-GI-NEXT: ushll2 v22.4s, v2.8h, #0
1445 ; CHECK-GI-NEXT: sshll2 v23.4s, v3.8h, #0
1446 ; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0
1447 ; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s
1448 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
1449 ; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0
1450 ; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s
1451 ; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s
1452 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
1453 ; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s
1454 ; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0
1455 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
1456 ; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0
1457 ; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
1458 ; CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s
1459 ; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s
1460 ; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s
1461 ; CHECK-GI-NEXT: mla v19.4s, v2.4s, v3.4s
1462 ; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s
1463 ; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s
1464 ; CHECK-GI-NEXT: addv s0, v0.4s
1465 ; CHECK-GI-NEXT: addv s1, v1.4s
1466 ; CHECK-GI-NEXT: fmov w8, s0
1467 ; CHECK-GI-NEXT: fmov w9, s1
1468 ; CHECK-GI-NEXT: add w0, w8, w9
1469 ; CHECK-GI-NEXT: ret
1471 %az = zext <16 x i8> %a to <16 x i32>
1472 %bz = sext <16 x i8> %b to <16 x i32>
1473 %m1 = mul nuw nsw <16 x i32> %az, %bz
1474 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
1475 %cz = zext <16 x i8> %c to <16 x i32>
1476 %dz = sext <16 x i8> %d to <16 x i32>
1477 %m2 = mul nuw nsw <16 x i32> %cz, %dz
1478 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
1479 %x = add i32 %r1, %r2
; Sum of two mixed-sign <16 x i8> dot products with the extension kinds
; swapped relative to test_usdot_v16i8_double:
; reduce(sext(a) * zext(b)) + reduce(sext(c) * zext(d)).
; The default llc run expects two usdot instructions with the zero-extended
; arguments (v1, v3) as the first source, a vector add, and one addv. The
; -global-isel run expects the two-step sshll/ushll widening with mul/mla,
; pairwise adds, an addv per product, then a scalar add.
; NOTE(review): the multiplies carry nuw although one operand of each is
; sign-extended - confirm that flag is intended for this test.
1484 define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1485 ; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double:
1486 ; CHECK-SD: // %bb.0: // %entry
1487 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
1488 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
1489 ; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v0.16b
1490 ; CHECK-SD-NEXT: usdot v4.4s, v3.16b, v2.16b
1491 ; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
1492 ; CHECK-SD-NEXT: addv s0, v0.4s
1493 ; CHECK-SD-NEXT: fmov w0, s0
1494 ; CHECK-SD-NEXT: ret
1496 ; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8_double:
1497 ; CHECK-GI: // %bb.0: // %entry
1498 ; CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0
1499 ; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0
1500 ; CHECK-GI-NEXT: ushll v5.8h, v1.8b, #0
1501 ; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
1502 ; CHECK-GI-NEXT: sshll v6.8h, v2.8b, #0
1503 ; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0
1504 ; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0
1505 ; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0
1506 ; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0
1507 ; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0
1508 ; CHECK-GI-NEXT: ushll2 v18.4s, v5.8h, #0
1509 ; CHECK-GI-NEXT: ushll2 v19.4s, v1.8h, #0
1510 ; CHECK-GI-NEXT: sshll2 v20.4s, v6.8h, #0
1511 ; CHECK-GI-NEXT: ushll2 v21.4s, v7.8h, #0
1512 ; CHECK-GI-NEXT: sshll2 v22.4s, v2.8h, #0
1513 ; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0
1514 ; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0
1515 ; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s
1516 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
1517 ; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0
1518 ; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s
1519 ; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s
1520 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
1521 ; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s
1522 ; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0
1523 ; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
1524 ; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0
1525 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
1526 ; CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s
1527 ; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s
1528 ; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s
1529 ; CHECK-GI-NEXT: mla v19.4s, v2.4s, v3.4s
1530 ; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s
1531 ; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s
1532 ; CHECK-GI-NEXT: addv s0, v0.4s
1533 ; CHECK-GI-NEXT: addv s1, v1.4s
1534 ; CHECK-GI-NEXT: fmov w8, s0
1535 ; CHECK-GI-NEXT: fmov w9, s1
1536 ; CHECK-GI-NEXT: add w0, w8, w9
1537 ; CHECK-GI-NEXT: ret
1539 %az = sext <16 x i8> %a to <16 x i32>
1540 %bz = zext <16 x i8> %b to <16 x i32>
1541 %m1 = mul nuw nsw <16 x i32> %az, %bz
1542 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
1543 %cz = sext <16 x i8> %c to <16 x i32>
1544 %dz = zext <16 x i8> %d to <16 x i32>
1545 %m2 = mul nuw nsw <16 x i32> %cz, %dz
1546 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
1547 %x = add i32 %r1, %r2
; Unsigned dot product of two <24 x i8> vectors loaded from %a and %b (a width
; that is not a single NEON register): zext both, multiply, add-reduce, then
; add %sum.
; The default llc run expects the work split into a 16-byte part (ldr q, udot
; .4s, addv) and an 8-byte part at offset 16 (ldr d, udot .2s, addp), with the
; two partial sums combined by scalar adds. The -global-isel run expects two
; .4s udot instructions (the second fed by the ldr d loads), a vector add, and
; one addv.
1551 define i32 @test_udot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
1552 ; CHECK-SD-LABEL: test_udot_v24i8:
1553 ; CHECK-SD: // %bb.0: // %entry
1554 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
1555 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
1556 ; CHECK-SD-NEXT: ldr q2, [x0]
1557 ; CHECK-SD-NEXT: ldr q3, [x1]
1558 ; CHECK-SD-NEXT: ldr d4, [x0, #16]
1559 ; CHECK-SD-NEXT: ldr d5, [x1, #16]
1560 ; CHECK-SD-NEXT: udot v1.2s, v5.8b, v4.8b
1561 ; CHECK-SD-NEXT: udot v0.4s, v3.16b, v2.16b
1562 ; CHECK-SD-NEXT: addp v1.2s, v1.2s, v1.2s
1563 ; CHECK-SD-NEXT: addv s0, v0.4s
1564 ; CHECK-SD-NEXT: fmov w8, s1
1565 ; CHECK-SD-NEXT: fmov w9, s0
1566 ; CHECK-SD-NEXT: add w8, w9, w8
1567 ; CHECK-SD-NEXT: add w0, w8, w2
1568 ; CHECK-SD-NEXT: ret
1570 ; CHECK-GI-LABEL: test_udot_v24i8:
1571 ; CHECK-GI: // %bb.0: // %entry
1572 ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
1573 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
1574 ; CHECK-GI-NEXT: ldr q2, [x0]
1575 ; CHECK-GI-NEXT: ldr d3, [x0, #16]
1576 ; CHECK-GI-NEXT: ldr q4, [x1]
1577 ; CHECK-GI-NEXT: ldr d5, [x1, #16]
1578 ; CHECK-GI-NEXT: udot v1.4s, v4.16b, v2.16b
1579 ; CHECK-GI-NEXT: udot v0.4s, v5.16b, v3.16b
1580 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
1581 ; CHECK-GI-NEXT: addv s0, v0.4s
1582 ; CHECK-GI-NEXT: fmov w8, s0
1583 ; CHECK-GI-NEXT: add w0, w8, w2
1584 ; CHECK-GI-NEXT: ret
1586 %0 = load <24 x i8>, ptr %a
1587 %1 = zext <24 x i8> %0 to <24 x i32>
1588 %2 = load <24 x i8>, ptr %b
1589 %3 = zext <24 x i8> %2 to <24 x i32>
1590 %4 = mul nuw nsw <24 x i32> %3, %1
1591 %5 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %4)
1592 %op.extra = add i32 %5, %sum
; Add-reduction of a zero-extended <24 x i8> load with no multiply in the IR.
; The default llc run expects a 16-byte udot .4s and an 8-byte udot .2s, each
; against an all-ones vector of matching width, reduced with addv and addp and
; then combined with a scalar add. The -global-isel run expects the 16-byte
; all-ones vector to be assembled from two 8-byte splats (mov v1.d[1],
; v0.d[0]), two .4s udot instructions, a vector add, and one addv.
1596 define i32 @test_udot_v24i8_nomla(ptr nocapture readonly %a1) {
1597 ; CHECK-SD-LABEL: test_udot_v24i8_nomla:
1598 ; CHECK-SD: // %bb.0: // %entry
1599 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
1600 ; CHECK-SD-NEXT: movi v1.8b, #1
1601 ; CHECK-SD-NEXT: ldr q4, [x0]
1602 ; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
1603 ; CHECK-SD-NEXT: movi v3.16b, #1
1604 ; CHECK-SD-NEXT: ldr d5, [x0, #16]
1605 ; CHECK-SD-NEXT: udot v2.2s, v5.8b, v1.8b
1606 ; CHECK-SD-NEXT: udot v0.4s, v4.16b, v3.16b
1607 ; CHECK-SD-NEXT: addp v1.2s, v2.2s, v2.2s
1608 ; CHECK-SD-NEXT: addv s0, v0.4s
1609 ; CHECK-SD-NEXT: fmov w8, s1
1610 ; CHECK-SD-NEXT: fmov w9, s0
1611 ; CHECK-SD-NEXT: add w0, w9, w8
1612 ; CHECK-SD-NEXT: ret
1614 ; CHECK-GI-LABEL: test_udot_v24i8_nomla:
1615 ; CHECK-GI: // %bb.0: // %entry
1616 ; CHECK-GI-NEXT: movi v0.8b, #1
1617 ; CHECK-GI-NEXT: movi v1.8b, #1
1618 ; CHECK-GI-NEXT: ldr q4, [x0]
1619 ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
1620 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
1621 ; CHECK-GI-NEXT: ldr d5, [x0, #16]
1622 ; CHECK-GI-NEXT: mov v1.d[1], v0.d[0]
1623 ; CHECK-GI-NEXT: udot v2.4s, v5.16b, v0.16b
1624 ; CHECK-GI-NEXT: udot v3.4s, v4.16b, v1.16b
1625 ; CHECK-GI-NEXT: add v0.4s, v3.4s, v2.4s
1626 ; CHECK-GI-NEXT: addv s0, v0.4s
1627 ; CHECK-GI-NEXT: fmov w0, s0
1628 ; CHECK-GI-NEXT: ret
1630 %0 = load <24 x i8>, ptr %a1
1631 %1 = zext <24 x i8> %0 to <24 x i32>
1632 %2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %1)
; Signed dot product of two <24 x i8> vectors loaded from %a and %b (a width
; that is not a single NEON register): sext both, multiply, add-reduce, then
; add %sum.
; The default llc run expects the work split into a 16-byte part (ldr q, sdot
; .4s, addv) and an 8-byte part at offset 16 (ldr d, sdot .2s, addp), with the
; two partial sums combined by scalar adds. The -global-isel run expects two
; .4s sdot instructions (the second fed by the ldr d loads), a vector add, and
; one addv.
1635 define i32 @test_sdot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
1636 ; CHECK-SD-LABEL: test_sdot_v24i8:
1637 ; CHECK-SD: // %bb.0: // %entry
1638 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
1639 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
1640 ; CHECK-SD-NEXT: ldr q2, [x0]
1641 ; CHECK-SD-NEXT: ldr q3, [x1]
1642 ; CHECK-SD-NEXT: ldr d4, [x0, #16]
1643 ; CHECK-SD-NEXT: ldr d5, [x1, #16]
1644 ; CHECK-SD-NEXT: sdot v1.2s, v5.8b, v4.8b
1645 ; CHECK-SD-NEXT: sdot v0.4s, v3.16b, v2.16b
1646 ; CHECK-SD-NEXT: addp v1.2s, v1.2s, v1.2s
1647 ; CHECK-SD-NEXT: addv s0, v0.4s
1648 ; CHECK-SD-NEXT: fmov w8, s1
1649 ; CHECK-SD-NEXT: fmov w9, s0
1650 ; CHECK-SD-NEXT: add w8, w9, w8
1651 ; CHECK-SD-NEXT: add w0, w8, w2
1652 ; CHECK-SD-NEXT: ret
1654 ; CHECK-GI-LABEL: test_sdot_v24i8:
1655 ; CHECK-GI: // %bb.0: // %entry
1656 ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
1657 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
1658 ; CHECK-GI-NEXT: ldr q2, [x0]
1659 ; CHECK-GI-NEXT: ldr d3, [x0, #16]
1660 ; CHECK-GI-NEXT: ldr q4, [x1]
1661 ; CHECK-GI-NEXT: ldr d5, [x1, #16]
1662 ; CHECK-GI-NEXT: sdot v1.4s, v4.16b, v2.16b
1663 ; CHECK-GI-NEXT: sdot v0.4s, v5.16b, v3.16b
1664 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
1665 ; CHECK-GI-NEXT: addv s0, v0.4s
1666 ; CHECK-GI-NEXT: fmov w8, s0
1667 ; CHECK-GI-NEXT: add w0, w8, w2
1668 ; CHECK-GI-NEXT: ret
1670 %0 = load <24 x i8>, ptr %a
1671 %1 = sext <24 x i8> %0 to <24 x i32>
1672 %2 = load <24 x i8>, ptr %b
1673 %3 = sext <24 x i8> %2 to <24 x i32>
1674 %4 = mul nsw <24 x i32> %3, %1
1675 %5 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %4)
1676 %op.extra = add nsw i32 %5, %sum
1680 define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 x i8> %d) {
1681 ; CHECK-SD-LABEL: test_sdot_v24i8_double:
1682 ; CHECK-SD: // %bb.0: // %entry
1683 ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1684 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
1685 ; CHECK-SD-NEXT: .cfi_offset w29, -16
1686 ; CHECK-SD-NEXT: fmov s0, w0
1687 ; CHECK-SD-NEXT: ldr b1, [sp, #144]
1688 ; CHECK-SD-NEXT: add x10, sp, #152
1689 ; CHECK-SD-NEXT: add x9, sp, #160
1690 ; CHECK-SD-NEXT: add x8, sp, #168
1691 ; CHECK-SD-NEXT: ldr b2, [sp, #272]
1692 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x10]
1693 ; CHECK-SD-NEXT: add x11, sp, #280
1694 ; CHECK-SD-NEXT: ldr b3, [sp, #80]
1695 ; CHECK-SD-NEXT: mov v0.b[1], w1
1696 ; CHECK-SD-NEXT: ldr b4, [sp, #528]
1697 ; CHECK-SD-NEXT: add x10, sp, #88
1698 ; CHECK-SD-NEXT: ld1 { v2.b }[1], [x11]
1699 ; CHECK-SD-NEXT: add x11, sp, #536
1700 ; CHECK-SD-NEXT: ldr b5, [sp, #336]
1701 ; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9]
1702 ; CHECK-SD-NEXT: ld1 { v3.b }[1], [x10]
1703 ; CHECK-SD-NEXT: add x10, sp, #344
1704 ; CHECK-SD-NEXT: ld1 { v4.b }[1], [x11]
1705 ; CHECK-SD-NEXT: add x11, sp, #176
1706 ; CHECK-SD-NEXT: ldr b6, [sp, #656]
1707 ; CHECK-SD-NEXT: mov v0.b[2], w2
1708 ; CHECK-SD-NEXT: ld1 { v5.b }[1], [x10]
1709 ; CHECK-SD-NEXT: ldr b7, [sp, #464]
1710 ; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8]
1711 ; CHECK-SD-NEXT: add x12, sp, #664
1712 ; CHECK-SD-NEXT: add x9, sp, #472
1713 ; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12]
1714 ; CHECK-SD-NEXT: add x8, sp, #96
1715 ; CHECK-SD-NEXT: add x10, sp, #184
1716 ; CHECK-SD-NEXT: add x12, sp, #288
1717 ; CHECK-SD-NEXT: ld1 { v7.b }[1], [x9]
1718 ; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8]
1719 ; CHECK-SD-NEXT: mov v0.b[3], w3
1720 ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x11]
1721 ; CHECK-SD-NEXT: add x8, sp, #352
1722 ; CHECK-SD-NEXT: ld1 { v2.b }[2], [x12]
1723 ; CHECK-SD-NEXT: add x13, sp, #544
1724 ; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8]
1725 ; CHECK-SD-NEXT: add x8, sp, #672
1726 ; CHECK-SD-NEXT: ld1 { v4.b }[2], [x13]
1727 ; CHECK-SD-NEXT: add x9, sp, #192
1728 ; CHECK-SD-NEXT: ld1 { v1.b }[5], [x10]
1729 ; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8]
1730 ; CHECK-SD-NEXT: add x8, sp, #480
1731 ; CHECK-SD-NEXT: mov v0.b[4], w4
1732 ; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8]
1733 ; CHECK-SD-NEXT: add x8, sp, #296
1734 ; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8]
1735 ; CHECK-SD-NEXT: add x8, sp, #552
1736 ; CHECK-SD-NEXT: add x12, sp, #200
1737 ; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9]
1738 ; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8]
1739 ; CHECK-SD-NEXT: add x8, sp, #360
1740 ; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8]
1741 ; CHECK-SD-NEXT: add x8, sp, #104
1742 ; CHECK-SD-NEXT: add x9, sp, #560
1743 ; CHECK-SD-NEXT: mov v0.b[5], w5
1744 ; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8]
1745 ; CHECK-SD-NEXT: add x8, sp, #368
1746 ; CHECK-SD-NEXT: ld1 { v1.b }[7], [x12]
1747 ; CHECK-SD-NEXT: ld1 { v4.b }[4], [x9]
1748 ; CHECK-SD-NEXT: add x13, sp, #208
1749 ; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8]
1750 ; CHECK-SD-NEXT: add x12, sp, #304
1751 ; CHECK-SD-NEXT: add x8, sp, #568
1752 ; CHECK-SD-NEXT: ld1 { v2.b }[4], [x12]
1753 ; CHECK-SD-NEXT: add x12, sp, #16
1754 ; CHECK-SD-NEXT: add x17, sp, #376
1755 ; CHECK-SD-NEXT: mov v0.b[6], w6
1756 ; CHECK-SD-NEXT: ld1 { v1.b }[8], [x13]
1757 ; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8]
1758 ; CHECK-SD-NEXT: add x14, sp, #216
1759 ; CHECK-SD-NEXT: ld1 { v5.b }[5], [x17]
1760 ; CHECK-SD-NEXT: add x13, sp, #576
1761 ; CHECK-SD-NEXT: add x11, sp, #224
1762 ; CHECK-SD-NEXT: add x10, sp, #232
1763 ; CHECK-SD-NEXT: add x15, sp, #240
1764 ; CHECK-SD-NEXT: ld1 { v1.b }[9], [x14]
1765 ; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13]
1766 ; CHECK-SD-NEXT: add x13, sp, #384
1767 ; CHECK-SD-NEXT: mov v0.b[7], w7
1768 ; CHECK-SD-NEXT: ld1 { v5.b }[6], [x13]
1769 ; CHECK-SD-NEXT: add x13, sp, #112
1770 ; CHECK-SD-NEXT: ld1 { v3.b }[4], [x13]
1771 ; CHECK-SD-NEXT: add x13, sp, #32
1772 ; CHECK-SD-NEXT: add x14, sp, #584
1773 ; CHECK-SD-NEXT: ld1 { v1.b }[10], [x11]
1774 ; CHECK-SD-NEXT: ld1 { v4.b }[7], [x14]
1775 ; CHECK-SD-NEXT: add x11, sp, #312
1776 ; CHECK-SD-NEXT: add x14, sp, #40
1777 ; CHECK-SD-NEXT: ld1 { v2.b }[5], [x11]
1778 ; CHECK-SD-NEXT: add x11, sp, #592
1779 ; CHECK-SD-NEXT: ld1 { v0.b }[8], [x12]
1780 ; CHECK-SD-NEXT: add x12, sp, #24
1781 ; CHECK-SD-NEXT: add x16, sp, #248
1782 ; CHECK-SD-NEXT: ld1 { v1.b }[11], [x10]
1783 ; CHECK-SD-NEXT: ld1 { v4.b }[8], [x11]
1784 ; CHECK-SD-NEXT: add x11, sp, #400
1785 ; CHECK-SD-NEXT: add x9, sp, #256
1786 ; CHECK-SD-NEXT: add x8, sp, #264
1787 ; CHECK-SD-NEXT: add x10, sp, #72
1788 ; CHECK-SD-NEXT: ld1 { v0.b }[9], [x12]
1789 ; CHECK-SD-NEXT: add x12, sp, #392
1790 ; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
1791 ; CHECK-SD-NEXT: ld1 { v5.b }[7], [x12]
1792 ; CHECK-SD-NEXT: add x12, sp, #48
1793 ; CHECK-SD-NEXT: ld1 { v1.b }[12], [x15]
1794 ; CHECK-SD-NEXT: add x15, sp, #120
1795 ; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
1796 ; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
1797 ; CHECK-SD-NEXT: ld1 { v0.b }[10], [x13]
1798 ; CHECK-SD-NEXT: ld1 { v3.b }[5], [x15]
1799 ; CHECK-SD-NEXT: add x15, sp, #408
1800 ; CHECK-SD-NEXT: ld1 { v5.b }[8], [x11]
1801 ; CHECK-SD-NEXT: add x13, sp, #56
1802 ; CHECK-SD-NEXT: ld1 { v1.b }[13], [x16]
1803 ; CHECK-SD-NEXT: add x11, sp, #64
1804 ; CHECK-SD-NEXT: add x16, sp, #616
1805 ; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
1806 ; CHECK-SD-NEXT: ld1 { v0.b }[11], [x14]
1807 ; CHECK-SD-NEXT: add x14, sp, #600
1808 ; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14]
1809 ; CHECK-SD-NEXT: ld1 { v5.b }[9], [x15]
1810 ; CHECK-SD-NEXT: add x15, sp, #608
1811 ; CHECK-SD-NEXT: ld1 { v1.b }[14], [x9]
1812 ; CHECK-SD-NEXT: add x9, sp, #488
1813 ; CHECK-SD-NEXT: add x14, sp, #320
1814 ; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12]
1815 ; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9]
1816 ; CHECK-SD-NEXT: ld1 { v2.b }[6], [x14]
1817 ; CHECK-SD-NEXT: ld1 { v4.b }[10], [x15]
1818 ; CHECK-SD-NEXT: add x14, sp, #624
1819 ; CHECK-SD-NEXT: add x9, sp, #688
1820 ; CHECK-SD-NEXT: ld1 { v1.b }[15], [x8]
1821 ; CHECK-SD-NEXT: add x8, sp, #432
1822 ; CHECK-SD-NEXT: add x12, sp, #328
1823 ; CHECK-SD-NEXT: ld1 { v0.b }[13], [x13]
1824 ; CHECK-SD-NEXT: add x13, sp, #416
1825 ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x12]
1826 ; CHECK-SD-NEXT: ld1 { v5.b }[10], [x13]
1827 ; CHECK-SD-NEXT: ld1 { v4.b }[11], [x16]
1828 ; CHECK-SD-NEXT: add x16, sp, #680
1829 ; CHECK-SD-NEXT: ld1 { v6.b }[3], [x16]
1830 ; CHECK-SD-NEXT: add x13, sp, #632
1831 ; CHECK-SD-NEXT: add x12, sp, #504
1832 ; CHECK-SD-NEXT: ld1 { v0.b }[14], [x11]
1833 ; CHECK-SD-NEXT: add x11, sp, #424
1834 ; CHECK-SD-NEXT: add x15, sp, #128
1835 ; CHECK-SD-NEXT: ld1 { v5.b }[11], [x11]
1836 ; CHECK-SD-NEXT: ld1 { v4.b }[12], [x14]
1837 ; CHECK-SD-NEXT: add x11, sp, #696
1838 ; CHECK-SD-NEXT: ld1 { v6.b }[4], [x9]
1839 ; CHECK-SD-NEXT: ld1 { v3.b }[6], [x15]
1840 ; CHECK-SD-NEXT: add x9, sp, #640
1841 ; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10]
1842 ; CHECK-SD-NEXT: add x10, sp, #496
1843 ; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8]
1844 ; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10]
1845 ; CHECK-SD-NEXT: ld1 { v4.b }[13], [x13]
1846 ; CHECK-SD-NEXT: add x10, sp, #440
1847 ; CHECK-SD-NEXT: ld1 { v6.b }[5], [x11]
1848 ; CHECK-SD-NEXT: add x11, sp, #512
1849 ; CHECK-SD-NEXT: add x8, sp, #136
1850 ; CHECK-SD-NEXT: sdot v17.4s, v0.16b, v1.16b
1851 ; CHECK-SD-NEXT: ld1 { v5.b }[13], [x10]
1852 ; CHECK-SD-NEXT: ld1 { v7.b }[5], [x12]
1853 ; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9]
1854 ; CHECK-SD-NEXT: add x9, sp, #448
1855 ; CHECK-SD-NEXT: add x10, sp, #704
1856 ; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8]
1857 ; CHECK-SD-NEXT: ld1 { v6.b }[6], [x10]
1858 ; CHECK-SD-NEXT: add x8, sp, #648
1859 ; CHECK-SD-NEXT: add x10, sp, #520
1860 ; CHECK-SD-NEXT: ld1 { v5.b }[14], [x9]
1861 ; CHECK-SD-NEXT: ld1 { v7.b }[6], [x11]
1862 ; CHECK-SD-NEXT: ld1 { v4.b }[15], [x8]
1863 ; CHECK-SD-NEXT: add x8, sp, #456
1864 ; CHECK-SD-NEXT: add x9, sp, #712
1865 ; CHECK-SD-NEXT: sdot v19.2s, v3.8b, v2.8b
1866 ; CHECK-SD-NEXT: ld1 { v6.b }[7], [x9]
1867 ; CHECK-SD-NEXT: addv s0, v17.4s
1868 ; CHECK-SD-NEXT: ld1 { v5.b }[15], [x8]
1869 ; CHECK-SD-NEXT: ld1 { v7.b }[7], [x10]
1870 ; CHECK-SD-NEXT: addp v1.2s, v19.2s, v19.2s
1871 ; CHECK-SD-NEXT: fmov w8, s0
1872 ; CHECK-SD-NEXT: sdot v16.4s, v5.16b, v4.16b
1873 ; CHECK-SD-NEXT: sdot v18.2s, v7.8b, v6.8b
1874 ; CHECK-SD-NEXT: fmov w9, s1
1875 ; CHECK-SD-NEXT: addv s2, v16.4s
1876 ; CHECK-SD-NEXT: addp v3.2s, v18.2s, v18.2s
1877 ; CHECK-SD-NEXT: add w8, w8, w9
1878 ; CHECK-SD-NEXT: fmov w10, s2
1879 ; CHECK-SD-NEXT: fmov w11, s3
1880 ; CHECK-SD-NEXT: add w9, w10, w11
1881 ; CHECK-SD-NEXT: add w0, w8, w9
1882 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1883 ; CHECK-SD-NEXT: ret
1885 ; CHECK-GI-LABEL: test_sdot_v24i8_double:
1886 ; CHECK-GI: // %bb.0: // %entry
1887 ; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1888 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
1889 ; CHECK-GI-NEXT: .cfi_offset w29, -16
1890 ; CHECK-GI-NEXT: ldr w8, [sp, #80]
1891 ; CHECK-GI-NEXT: ldr w9, [sp, #88]
1892 ; CHECK-GI-NEXT: fmov s1, w0
1893 ; CHECK-GI-NEXT: ldr w11, [sp, #336]
1894 ; CHECK-GI-NEXT: ldr w10, [sp, #280]
1895 ; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
1896 ; CHECK-GI-NEXT: fmov s0, w8
1897 ; CHECK-GI-NEXT: ldr w8, [sp, #96]
1898 ; CHECK-GI-NEXT: ldr w12, [sp, #152]
1899 ; CHECK-GI-NEXT: mov v1.b[1], w1
1900 ; CHECK-GI-NEXT: fmov s4, w11
1901 ; CHECK-GI-NEXT: ldr w11, [sp, #584]
1902 ; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
1903 ; CHECK-GI-NEXT: movi v18.2d, #0000000000000000
1904 ; CHECK-GI-NEXT: movi v19.2d, #0000000000000000
1905 ; CHECK-GI-NEXT: mov v0.b[1], w9
1906 ; CHECK-GI-NEXT: ldr w9, [sp, #272]
1907 ; CHECK-GI-NEXT: fmov s2, w9
1908 ; CHECK-GI-NEXT: ldr w9, [sp, #144]
1909 ; CHECK-GI-NEXT: mov v1.b[2], w2
1910 ; CHECK-GI-NEXT: mov v0.b[2], w8
1911 ; CHECK-GI-NEXT: ldr w8, [sp, #528]
1912 ; CHECK-GI-NEXT: fmov s3, w9
1913 ; CHECK-GI-NEXT: mov v2.b[1], w10
1914 ; CHECK-GI-NEXT: ldr w9, [sp, #344]
1915 ; CHECK-GI-NEXT: ldr w10, [sp, #536]
1916 ; CHECK-GI-NEXT: fmov s5, w8
1917 ; CHECK-GI-NEXT: ldr w8, [sp, #288]
1918 ; CHECK-GI-NEXT: mov v1.b[3], w3
1919 ; CHECK-GI-NEXT: mov v3.b[1], w12
1920 ; CHECK-GI-NEXT: mov v4.b[1], w9
1921 ; CHECK-GI-NEXT: ldr w9, [sp, #160]
1922 ; CHECK-GI-NEXT: mov v5.b[1], w10
1923 ; CHECK-GI-NEXT: mov v2.b[2], w8
1924 ; CHECK-GI-NEXT: ldr w8, [sp, #104]
1925 ; CHECK-GI-NEXT: ldr w10, [sp, #352]
1926 ; CHECK-GI-NEXT: mov v1.b[4], w4
1927 ; CHECK-GI-NEXT: mov v3.b[2], w9
1928 ; CHECK-GI-NEXT: ldr w9, [sp, #544]
1929 ; CHECK-GI-NEXT: mov v0.b[3], w8
1930 ; CHECK-GI-NEXT: ldr w8, [sp, #296]
1931 ; CHECK-GI-NEXT: mov v4.b[2], w10
1932 ; CHECK-GI-NEXT: ldr w10, [sp, #360]
1933 ; CHECK-GI-NEXT: mov v5.b[2], w9
1934 ; CHECK-GI-NEXT: ldr w9, [sp, #168]
1935 ; CHECK-GI-NEXT: mov v2.b[3], w8
1936 ; CHECK-GI-NEXT: ldr w8, [sp, #112]
1937 ; CHECK-GI-NEXT: mov v1.b[5], w5
1938 ; CHECK-GI-NEXT: mov v3.b[3], w9
1939 ; CHECK-GI-NEXT: ldr w9, [sp, #552]
1940 ; CHECK-GI-NEXT: mov v0.b[4], w8
1941 ; CHECK-GI-NEXT: ldr w8, [sp, #304]
1942 ; CHECK-GI-NEXT: mov v4.b[3], w10
1943 ; CHECK-GI-NEXT: mov v5.b[3], w9
1944 ; CHECK-GI-NEXT: ldr w9, [sp, #176]
1945 ; CHECK-GI-NEXT: ldr w10, [sp, #368]
1946 ; CHECK-GI-NEXT: mov v2.b[4], w8
1947 ; CHECK-GI-NEXT: ldr w8, [sp, #120]
1948 ; CHECK-GI-NEXT: mov v1.b[6], w6
1949 ; CHECK-GI-NEXT: mov v3.b[4], w9
1950 ; CHECK-GI-NEXT: ldr w9, [sp, #560]
1951 ; CHECK-GI-NEXT: mov v0.b[5], w8
1952 ; CHECK-GI-NEXT: ldr w8, [sp, #312]
1953 ; CHECK-GI-NEXT: mov v4.b[4], w10
1954 ; CHECK-GI-NEXT: mov v5.b[4], w9
1955 ; CHECK-GI-NEXT: ldr w9, [sp, #184]
1956 ; CHECK-GI-NEXT: ldr w10, [sp, #376]
1957 ; CHECK-GI-NEXT: mov v2.b[5], w8
1958 ; CHECK-GI-NEXT: ldr w8, [sp, #128]
1959 ; CHECK-GI-NEXT: mov v1.b[7], w7
1960 ; CHECK-GI-NEXT: mov v3.b[5], w9
1961 ; CHECK-GI-NEXT: ldr w9, [sp, #568]
1962 ; CHECK-GI-NEXT: mov v0.b[6], w8
1963 ; CHECK-GI-NEXT: ldr w8, [sp, #320]
1964 ; CHECK-GI-NEXT: mov v4.b[5], w10
1965 ; CHECK-GI-NEXT: mov v5.b[5], w9
1966 ; CHECK-GI-NEXT: ldr w9, [sp, #192]
1967 ; CHECK-GI-NEXT: ldr w10, [sp, #384]
1968 ; CHECK-GI-NEXT: mov v2.b[6], w8
1969 ; CHECK-GI-NEXT: ldr w8, [sp, #136]
1970 ; CHECK-GI-NEXT: mov v3.b[6], w9
1971 ; CHECK-GI-NEXT: ldr w9, [sp, #576]
1972 ; CHECK-GI-NEXT: mov v0.b[7], w8
1973 ; CHECK-GI-NEXT: ldr w8, [sp, #328]
1974 ; CHECK-GI-NEXT: mov v4.b[6], w10
1975 ; CHECK-GI-NEXT: ldr w10, [sp, #200]
1976 ; CHECK-GI-NEXT: mov v5.b[6], w9
1977 ; CHECK-GI-NEXT: ldr w9, [sp, #392]
1978 ; CHECK-GI-NEXT: mov v2.b[7], w8
1979 ; CHECK-GI-NEXT: ldr w8, [sp, #464]
1980 ; CHECK-GI-NEXT: mov v3.b[7], w10
1981 ; CHECK-GI-NEXT: ldr w10, [sp, #16]
1982 ; CHECK-GI-NEXT: fmov s6, w8
1983 ; CHECK-GI-NEXT: ldr w8, [sp, #208]
1984 ; CHECK-GI-NEXT: mov v4.b[7], w9
1985 ; CHECK-GI-NEXT: mov v1.b[8], w10
1986 ; CHECK-GI-NEXT: ldr w10, [sp, #656]
1987 ; CHECK-GI-NEXT: ldr w9, [sp, #472]
1988 ; CHECK-GI-NEXT: mov v5.b[7], w11
1989 ; CHECK-GI-NEXT: ldr w11, [sp, #400]
1990 ; CHECK-GI-NEXT: fmov d0, d0
1991 ; CHECK-GI-NEXT: fmov s7, w10
1992 ; CHECK-GI-NEXT: mov v6.b[1], w9
1993 ; CHECK-GI-NEXT: ldr w9, [sp, #592]
1994 ; CHECK-GI-NEXT: mov v3.b[8], w8
1995 ; CHECK-GI-NEXT: ldr w10, [sp, #664]
1996 ; CHECK-GI-NEXT: ldr w8, [sp, #24]
1997 ; CHECK-GI-NEXT: mov v4.b[8], w11
1998 ; CHECK-GI-NEXT: ldr w11, [sp, #216]
1999 ; CHECK-GI-NEXT: fmov d2, d2
2000 ; CHECK-GI-NEXT: mov v5.b[8], w9
2001 ; CHECK-GI-NEXT: ldr w9, [sp, #480]
2002 ; CHECK-GI-NEXT: mov v7.b[1], w10
2003 ; CHECK-GI-NEXT: mov v1.b[9], w8
2004 ; CHECK-GI-NEXT: ldr w8, [sp, #408]
2005 ; CHECK-GI-NEXT: ldr w10, [sp, #600]
2006 ; CHECK-GI-NEXT: mov v3.b[9], w11
2007 ; CHECK-GI-NEXT: mov v6.b[2], w9
2008 ; CHECK-GI-NEXT: ldr w9, [sp, #672]
2009 ; CHECK-GI-NEXT: ldr w11, [sp, #32]
2010 ; CHECK-GI-NEXT: mov v4.b[9], w8
2011 ; CHECK-GI-NEXT: ldr w8, [sp, #224]
2012 ; CHECK-GI-NEXT: mov v5.b[9], w10
2013 ; CHECK-GI-NEXT: ldr w10, [sp, #488]
2014 ; CHECK-GI-NEXT: mov v7.b[2], w9
2015 ; CHECK-GI-NEXT: mov v1.b[10], w11
2016 ; CHECK-GI-NEXT: ldr w9, [sp, #416]
2017 ; CHECK-GI-NEXT: ldr w11, [sp, #608]
2018 ; CHECK-GI-NEXT: mov v3.b[10], w8
2019 ; CHECK-GI-NEXT: mov v6.b[3], w10
2020 ; CHECK-GI-NEXT: ldr w10, [sp, #680]
2021 ; CHECK-GI-NEXT: ldr w8, [sp, #40]
2022 ; CHECK-GI-NEXT: mov v4.b[10], w9
2023 ; CHECK-GI-NEXT: ldr w9, [sp, #232]
2024 ; CHECK-GI-NEXT: mov v5.b[10], w11
2025 ; CHECK-GI-NEXT: ldr w11, [sp, #496]
2026 ; CHECK-GI-NEXT: mov v7.b[3], w10
2027 ; CHECK-GI-NEXT: mov v1.b[11], w8
2028 ; CHECK-GI-NEXT: ldr w8, [sp, #424]
2029 ; CHECK-GI-NEXT: ldr w10, [sp, #616]
2030 ; CHECK-GI-NEXT: mov v3.b[11], w9
2031 ; CHECK-GI-NEXT: mov v6.b[4], w11
2032 ; CHECK-GI-NEXT: ldr w11, [sp, #688]
2033 ; CHECK-GI-NEXT: ldr w9, [sp, #48]
2034 ; CHECK-GI-NEXT: mov v4.b[11], w8
2035 ; CHECK-GI-NEXT: ldr w8, [sp, #240]
2036 ; CHECK-GI-NEXT: mov v5.b[11], w10
2037 ; CHECK-GI-NEXT: ldr w10, [sp, #504]
2038 ; CHECK-GI-NEXT: mov v7.b[4], w11
2039 ; CHECK-GI-NEXT: mov v1.b[12], w9
2040 ; CHECK-GI-NEXT: ldr w9, [sp, #432]
2041 ; CHECK-GI-NEXT: ldr w11, [sp, #624]
2042 ; CHECK-GI-NEXT: mov v3.b[12], w8
2043 ; CHECK-GI-NEXT: mov v6.b[5], w10
2044 ; CHECK-GI-NEXT: ldr w10, [sp, #696]
2045 ; CHECK-GI-NEXT: ldr w8, [sp, #56]
2046 ; CHECK-GI-NEXT: mov v4.b[12], w9
2047 ; CHECK-GI-NEXT: ldr w9, [sp, #248]
2048 ; CHECK-GI-NEXT: mov v5.b[12], w11
2049 ; CHECK-GI-NEXT: ldr w11, [sp, #512]
2050 ; CHECK-GI-NEXT: mov v7.b[5], w10
2051 ; CHECK-GI-NEXT: mov v1.b[13], w8
2052 ; CHECK-GI-NEXT: ldr w8, [sp, #440]
2053 ; CHECK-GI-NEXT: ldr w10, [sp, #632]
2054 ; CHECK-GI-NEXT: mov v3.b[13], w9
2055 ; CHECK-GI-NEXT: mov v6.b[6], w11
2056 ; CHECK-GI-NEXT: ldr w11, [sp, #704]
2057 ; CHECK-GI-NEXT: ldr w9, [sp, #64]
2058 ; CHECK-GI-NEXT: mov v4.b[13], w8
2059 ; CHECK-GI-NEXT: ldr w8, [sp, #256]
2060 ; CHECK-GI-NEXT: mov v5.b[13], w10
2061 ; CHECK-GI-NEXT: ldr w10, [sp, #520]
2062 ; CHECK-GI-NEXT: mov v7.b[6], w11
2063 ; CHECK-GI-NEXT: mov v1.b[14], w9
2064 ; CHECK-GI-NEXT: ldr w9, [sp, #448]
2065 ; CHECK-GI-NEXT: ldr w11, [sp, #640]
2066 ; CHECK-GI-NEXT: mov v3.b[14], w8
2067 ; CHECK-GI-NEXT: mov v6.b[7], w10
2068 ; CHECK-GI-NEXT: ldr w10, [sp, #712]
2069 ; CHECK-GI-NEXT: ldr w8, [sp, #72]
2070 ; CHECK-GI-NEXT: mov v4.b[14], w9
2071 ; CHECK-GI-NEXT: ldr w9, [sp, #264]
2072 ; CHECK-GI-NEXT: mov v5.b[14], w11
2073 ; CHECK-GI-NEXT: mov v7.b[7], w10
2074 ; CHECK-GI-NEXT: sdot v18.4s, v0.16b, v2.16b
2075 ; CHECK-GI-NEXT: mov v1.b[15], w8
2076 ; CHECK-GI-NEXT: ldr w8, [sp, #456]
2077 ; CHECK-GI-NEXT: mov v3.b[15], w9
2078 ; CHECK-GI-NEXT: ldr w9, [sp, #648]
2079 ; CHECK-GI-NEXT: fmov d6, d6
2080 ; CHECK-GI-NEXT: mov v4.b[15], w8
2081 ; CHECK-GI-NEXT: mov v5.b[15], w9
2082 ; CHECK-GI-NEXT: fmov d7, d7
2083 ; CHECK-GI-NEXT: sdot v17.4s, v1.16b, v3.16b
2084 ; CHECK-GI-NEXT: sdot v19.4s, v4.16b, v5.16b
2085 ; CHECK-GI-NEXT: sdot v16.4s, v6.16b, v7.16b
2086 ; CHECK-GI-NEXT: add v0.4s, v17.4s, v18.4s
2087 ; CHECK-GI-NEXT: add v1.4s, v19.4s, v16.4s
2088 ; CHECK-GI-NEXT: addv s0, v0.4s
2089 ; CHECK-GI-NEXT: addv s1, v1.4s
2090 ; CHECK-GI-NEXT: fmov w8, s0
2091 ; CHECK-GI-NEXT: fmov w9, s1
2092 ; CHECK-GI-NEXT: add w0, w8, w9
2093 ; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
2094 ; CHECK-GI-NEXT: ret
2096 %az = sext <24 x i8> %a to <24 x i32>
2097 %bz = sext <24 x i8> %b to <24 x i32>
2098 %m1 = mul nuw nsw <24 x i32> %az, %bz
2099 %r1 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %m1)
2100 %cz = sext <24 x i8> %c to <24 x i32>
2101 %dz = sext <24 x i8> %d to <24 x i32>
2102 %m2 = mul nuw nsw <24 x i32> %cz, %dz
2103 %r2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %m2)
2104 %x = add i32 %r1, %r2
; test_sdot_v24i8_double_nomla: "no multiply" variant of the double <24 x i8>
; signed reduction. Only %a and %c are sign-extended to <24 x i32> and
; reduce-added; the two scalar sums are then added into %x. %b and %d are part
; of the signature but are not referenced by the body shown here.
;
; Expected code, default llc run (SD prefix): the byte vectors are built from
; w0-w7 and stack slots, then summed with sdot against all-ones byte vectors
; (movi #1) -- 128-bit sdot for the first 16 bytes, 64-bit sdot for the
; remaining 8 -- and the partial results are reduced with addv/addp.
; Expected code, -global-isel run (GI prefix): the vectors are built with
; scalar ldr/mov lane inserts, all four partial sums use 128-bit sdot, and the
; accumulators are added pairwise before addv.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2108 define i32 @test_sdot_v24i8_double_nomla(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 x i8> %d) {
2109 ; CHECK-SD-LABEL: test_sdot_v24i8_double_nomla:
2110 ; CHECK-SD: // %bb.0: // %entry
2111 ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
2112 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
2113 ; CHECK-SD-NEXT: .cfi_offset w29, -16
2114 ; CHECK-SD-NEXT: fmov s0, w0
2115 ; CHECK-SD-NEXT: ldr b1, [sp, #336]
2116 ; CHECK-SD-NEXT: add x8, sp, #344
2117 ; CHECK-SD-NEXT: add x9, sp, #400
2118 ; CHECK-SD-NEXT: ldr b2, [sp, #80]
2119 ; CHECK-SD-NEXT: ldr b3, [sp, #464]
2120 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
2121 ; CHECK-SD-NEXT: add x8, sp, #352
2122 ; CHECK-SD-NEXT: add x10, sp, #408
2123 ; CHECK-SD-NEXT: mov v0.b[1], w1
2124 ; CHECK-SD-NEXT: add x11, sp, #472
2125 ; CHECK-SD-NEXT: add x12, sp, #480
2126 ; CHECK-SD-NEXT: ld1 { v3.b }[1], [x11]
2127 ; CHECK-SD-NEXT: add x11, sp, #416
2128 ; CHECK-SD-NEXT: add x13, sp, #488
2129 ; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8]
2130 ; CHECK-SD-NEXT: add x8, sp, #360
2131 ; CHECK-SD-NEXT: add x14, sp, #496
2132 ; CHECK-SD-NEXT: movi v4.16b, #1
2133 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
2134 ; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
2135 ; CHECK-SD-NEXT: mov v0.b[2], w2
2136 ; CHECK-SD-NEXT: ld1 { v3.b }[2], [x12]
2137 ; CHECK-SD-NEXT: add x12, sp, #424
2138 ; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8]
2139 ; CHECK-SD-NEXT: add x8, sp, #368
2140 ; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
2141 ; CHECK-SD-NEXT: movi v16.8b, #1
2142 ; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
2143 ; CHECK-SD-NEXT: ld1 { v3.b }[3], [x13]
2144 ; CHECK-SD-NEXT: add x13, sp, #432
2145 ; CHECK-SD-NEXT: mov v0.b[3], w3
2146 ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8]
2147 ; CHECK-SD-NEXT: add x8, sp, #376
2148 ; CHECK-SD-NEXT: ld1 { v3.b }[4], [x14]
2149 ; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8]
2150 ; CHECK-SD-NEXT: add x8, sp, #384
2151 ; CHECK-SD-NEXT: mov v0.b[4], w4
2152 ; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8]
2153 ; CHECK-SD-NEXT: add x8, sp, #392
2154 ; CHECK-SD-NEXT: mov v0.b[5], w5
2155 ; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8]
2156 ; CHECK-SD-NEXT: add x8, sp, #16
2157 ; CHECK-SD-NEXT: mov v0.b[6], w6
2158 ; CHECK-SD-NEXT: ld1 { v1.b }[8], [x9]
2159 ; CHECK-SD-NEXT: add x9, sp, #88
2160 ; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9]
2161 ; CHECK-SD-NEXT: add x9, sp, #40
2162 ; CHECK-SD-NEXT: ld1 { v1.b }[9], [x10]
2163 ; CHECK-SD-NEXT: add x10, sp, #96
2164 ; CHECK-SD-NEXT: mov v0.b[7], w7
2165 ; CHECK-SD-NEXT: ld1 { v2.b }[2], [x10]
2166 ; CHECK-SD-NEXT: add x10, sp, #56
2167 ; CHECK-SD-NEXT: ld1 { v1.b }[10], [x11]
2168 ; CHECK-SD-NEXT: add x11, sp, #104
2169 ; CHECK-SD-NEXT: ld1 { v2.b }[3], [x11]
2170 ; CHECK-SD-NEXT: add x11, sp, #72
2171 ; CHECK-SD-NEXT: ld1 { v0.b }[8], [x8]
2172 ; CHECK-SD-NEXT: add x8, sp, #24
2173 ; CHECK-SD-NEXT: ld1 { v1.b }[11], [x12]
2174 ; CHECK-SD-NEXT: add x12, sp, #112
2175 ; CHECK-SD-NEXT: ld1 { v2.b }[4], [x12]
2176 ; CHECK-SD-NEXT: add x12, sp, #440
2177 ; CHECK-SD-NEXT: ld1 { v0.b }[9], [x8]
2178 ; CHECK-SD-NEXT: add x8, sp, #32
2179 ; CHECK-SD-NEXT: ld1 { v1.b }[12], [x13]
2180 ; CHECK-SD-NEXT: add x13, sp, #504
2181 ; CHECK-SD-NEXT: ld1 { v3.b }[5], [x13]
2182 ; CHECK-SD-NEXT: add x13, sp, #512
2183 ; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8]
2184 ; CHECK-SD-NEXT: add x8, sp, #48
2185 ; CHECK-SD-NEXT: ld1 { v1.b }[13], [x12]
2186 ; CHECK-SD-NEXT: add x12, sp, #448
2187 ; CHECK-SD-NEXT: ld1 { v3.b }[6], [x13]
2188 ; CHECK-SD-NEXT: ld1 { v0.b }[11], [x9]
2189 ; CHECK-SD-NEXT: add x9, sp, #64
2190 ; CHECK-SD-NEXT: ld1 { v1.b }[14], [x12]
2191 ; CHECK-SD-NEXT: ld1 { v0.b }[12], [x8]
2192 ; CHECK-SD-NEXT: add x8, sp, #120
2193 ; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8]
2194 ; CHECK-SD-NEXT: add x8, sp, #128
2195 ; CHECK-SD-NEXT: ld1 { v0.b }[13], [x10]
2196 ; CHECK-SD-NEXT: add x10, sp, #136
2197 ; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8]
2198 ; CHECK-SD-NEXT: add x8, sp, #456
2199 ; CHECK-SD-NEXT: ld1 { v1.b }[15], [x8]
2200 ; CHECK-SD-NEXT: ld1 { v0.b }[14], [x9]
2201 ; CHECK-SD-NEXT: add x9, sp, #520
2202 ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x10]
2203 ; CHECK-SD-NEXT: ld1 { v3.b }[7], [x9]
2204 ; CHECK-SD-NEXT: sdot v5.4s, v1.16b, v4.16b
2205 ; CHECK-SD-NEXT: ld1 { v0.b }[15], [x11]
2206 ; CHECK-SD-NEXT: sdot v17.2s, v2.8b, v16.8b
2207 ; CHECK-SD-NEXT: sdot v7.2s, v3.8b, v16.8b
2208 ; CHECK-SD-NEXT: sdot v6.4s, v0.16b, v4.16b
2209 ; CHECK-SD-NEXT: addv s3, v5.4s
2210 ; CHECK-SD-NEXT: addp v1.2s, v17.2s, v17.2s
2211 ; CHECK-SD-NEXT: addp v2.2s, v7.2s, v7.2s
2212 ; CHECK-SD-NEXT: fmov w10, s3
2213 ; CHECK-SD-NEXT: addv s0, v6.4s
2214 ; CHECK-SD-NEXT: fmov w9, s1
2215 ; CHECK-SD-NEXT: fmov w11, s2
2216 ; CHECK-SD-NEXT: fmov w8, s0
2217 ; CHECK-SD-NEXT: add w8, w8, w9
2218 ; CHECK-SD-NEXT: add w9, w10, w11
2219 ; CHECK-SD-NEXT: add w0, w8, w9
2220 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
2221 ; CHECK-SD-NEXT: ret
2223 ; CHECK-GI-LABEL: test_sdot_v24i8_double_nomla:
2224 ; CHECK-GI: // %bb.0: // %entry
2225 ; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
2226 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
2227 ; CHECK-GI-NEXT: .cfi_offset w29, -16
2228 ; CHECK-GI-NEXT: ldr w9, [sp, #336]
2229 ; CHECK-GI-NEXT: ldr w8, [sp, #344]
2230 ; CHECK-GI-NEXT: fmov s0, w0
2231 ; CHECK-GI-NEXT: ldr w10, [sp, #16]
2232 ; CHECK-GI-NEXT: ldr w11, [sp, #88]
2233 ; CHECK-GI-NEXT: movi v4.8b, #1
2234 ; CHECK-GI-NEXT: fmov s1, w9
2235 ; CHECK-GI-NEXT: ldr w9, [sp, #464]
2236 ; CHECK-GI-NEXT: ldr w12, [sp, #400]
2237 ; CHECK-GI-NEXT: mov v0.b[1], w1
2238 ; CHECK-GI-NEXT: movi v5.8b, #1
2239 ; CHECK-GI-NEXT: movi v6.8b, #1
2240 ; CHECK-GI-NEXT: fmov s2, w9
2241 ; CHECK-GI-NEXT: ldr w9, [sp, #96]
2242 ; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
2243 ; CHECK-GI-NEXT: mov v1.b[1], w8
2244 ; CHECK-GI-NEXT: ldr w8, [sp, #352]
2245 ; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
2246 ; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
2247 ; CHECK-GI-NEXT: movi v18.2d, #0000000000000000
2248 ; CHECK-GI-NEXT: mov v0.b[2], w2
2249 ; CHECK-GI-NEXT: mov v5.d[1], v4.d[0]
2250 ; CHECK-GI-NEXT: mov v6.d[1], v4.d[0]
2251 ; CHECK-GI-NEXT: mov v1.b[2], w8
2252 ; CHECK-GI-NEXT: ldr w8, [sp, #360]
2253 ; CHECK-GI-NEXT: mov v0.b[3], w3
2254 ; CHECK-GI-NEXT: mov v1.b[3], w8
2255 ; CHECK-GI-NEXT: ldr w8, [sp, #368]
2256 ; CHECK-GI-NEXT: mov v0.b[4], w4
2257 ; CHECK-GI-NEXT: mov v1.b[4], w8
2258 ; CHECK-GI-NEXT: ldr w8, [sp, #376]
2259 ; CHECK-GI-NEXT: mov v0.b[5], w5
2260 ; CHECK-GI-NEXT: mov v1.b[5], w8
2261 ; CHECK-GI-NEXT: ldr w8, [sp, #384]
2262 ; CHECK-GI-NEXT: mov v0.b[6], w6
2263 ; CHECK-GI-NEXT: mov v1.b[6], w8
2264 ; CHECK-GI-NEXT: ldr w8, [sp, #392]
2265 ; CHECK-GI-NEXT: mov v0.b[7], w7
2266 ; CHECK-GI-NEXT: mov v1.b[7], w8
2267 ; CHECK-GI-NEXT: ldr w8, [sp, #80]
2268 ; CHECK-GI-NEXT: fmov s3, w8
2269 ; CHECK-GI-NEXT: ldr w8, [sp, #472]
2270 ; CHECK-GI-NEXT: mov v0.b[8], w10
2271 ; CHECK-GI-NEXT: ldr w10, [sp, #408]
2272 ; CHECK-GI-NEXT: mov v1.b[8], w12
2273 ; CHECK-GI-NEXT: mov v2.b[1], w8
2274 ; CHECK-GI-NEXT: ldr w8, [sp, #24]
2275 ; CHECK-GI-NEXT: mov v3.b[1], w11
2276 ; CHECK-GI-NEXT: ldr w11, [sp, #480]
2277 ; CHECK-GI-NEXT: mov v0.b[9], w8
2278 ; CHECK-GI-NEXT: ldr w8, [sp, #32]
2279 ; CHECK-GI-NEXT: mov v1.b[9], w10
2280 ; CHECK-GI-NEXT: mov v2.b[2], w11
2281 ; CHECK-GI-NEXT: ldr w10, [sp, #416]
2282 ; CHECK-GI-NEXT: mov v3.b[2], w9
2283 ; CHECK-GI-NEXT: ldr w9, [sp, #104]
2284 ; CHECK-GI-NEXT: ldr w11, [sp, #488]
2285 ; CHECK-GI-NEXT: mov v0.b[10], w8
2286 ; CHECK-GI-NEXT: ldr w8, [sp, #40]
2287 ; CHECK-GI-NEXT: mov v1.b[10], w10
2288 ; CHECK-GI-NEXT: mov v2.b[3], w11
2289 ; CHECK-GI-NEXT: ldr w10, [sp, #424]
2290 ; CHECK-GI-NEXT: mov v3.b[3], w9
2291 ; CHECK-GI-NEXT: ldr w9, [sp, #112]
2292 ; CHECK-GI-NEXT: ldr w11, [sp, #496]
2293 ; CHECK-GI-NEXT: mov v0.b[11], w8
2294 ; CHECK-GI-NEXT: ldr w8, [sp, #48]
2295 ; CHECK-GI-NEXT: mov v1.b[11], w10
2296 ; CHECK-GI-NEXT: mov v2.b[4], w11
2297 ; CHECK-GI-NEXT: ldr w10, [sp, #432]
2298 ; CHECK-GI-NEXT: mov v3.b[4], w9
2299 ; CHECK-GI-NEXT: ldr w9, [sp, #120]
2300 ; CHECK-GI-NEXT: ldr w11, [sp, #504]
2301 ; CHECK-GI-NEXT: mov v0.b[12], w8
2302 ; CHECK-GI-NEXT: ldr w8, [sp, #56]
2303 ; CHECK-GI-NEXT: mov v1.b[12], w10
2304 ; CHECK-GI-NEXT: mov v2.b[5], w11
2305 ; CHECK-GI-NEXT: ldr w10, [sp, #440]
2306 ; CHECK-GI-NEXT: mov v3.b[5], w9
2307 ; CHECK-GI-NEXT: ldr w9, [sp, #128]
2308 ; CHECK-GI-NEXT: ldr w11, [sp, #512]
2309 ; CHECK-GI-NEXT: mov v0.b[13], w8
2310 ; CHECK-GI-NEXT: ldr w8, [sp, #64]
2311 ; CHECK-GI-NEXT: mov v1.b[13], w10
2312 ; CHECK-GI-NEXT: mov v2.b[6], w11
2313 ; CHECK-GI-NEXT: ldr w10, [sp, #448]
2314 ; CHECK-GI-NEXT: mov v3.b[6], w9
2315 ; CHECK-GI-NEXT: ldr w9, [sp, #136]
2316 ; CHECK-GI-NEXT: ldr w11, [sp, #520]
2317 ; CHECK-GI-NEXT: mov v0.b[14], w8
2318 ; CHECK-GI-NEXT: ldr w8, [sp, #72]
2319 ; CHECK-GI-NEXT: mov v1.b[14], w10
2320 ; CHECK-GI-NEXT: mov v2.b[7], w11
2321 ; CHECK-GI-NEXT: mov v3.b[7], w9
2322 ; CHECK-GI-NEXT: ldr w9, [sp, #456]
2323 ; CHECK-GI-NEXT: mov v0.b[15], w8
2324 ; CHECK-GI-NEXT: mov v1.b[15], w9
2325 ; CHECK-GI-NEXT: fmov d2, d2
2326 ; CHECK-GI-NEXT: fmov d3, d3
2327 ; CHECK-GI-NEXT: sdot v16.4s, v0.16b, v5.16b
2328 ; CHECK-GI-NEXT: sdot v18.4s, v1.16b, v6.16b
2329 ; CHECK-GI-NEXT: sdot v7.4s, v2.16b, v4.16b
2330 ; CHECK-GI-NEXT: sdot v17.4s, v3.16b, v4.16b
2331 ; CHECK-GI-NEXT: add v1.4s, v18.4s, v7.4s
2332 ; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s
2333 ; CHECK-GI-NEXT: addv s1, v1.4s
2334 ; CHECK-GI-NEXT: addv s0, v0.4s
2335 ; CHECK-GI-NEXT: fmov w9, s1
2336 ; CHECK-GI-NEXT: fmov w8, s0
2337 ; CHECK-GI-NEXT: add w0, w8, w9
2338 ; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
2339 ; CHECK-GI-NEXT: ret
; IR under test: two independent sext + reduce.add chains (over %a and %c)
; with no multiply, combined by a scalar add.
2341 %az = sext <24 x i8> %a to <24 x i32>
2342 %r1 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %az)
2343 %cz = sext <24 x i8> %c to <24 x i32>
2344 %r2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %cz)
2345 %x = add i32 %r1, %r2
; test_udot_v25i8: unsigned multiply-accumulate reduction over an odd-width
; vector. Loads <25 x i8> from %a and %b, zero-extends both to <25 x i32>,
; multiplies them, reduce-adds the products and adds %sum.
;
; Neither expected sequence below contains a udot instruction for this type:
; - default llc run (SD prefix): umull/umull2 on the two 16-byte loads, then
;   widening adds (uaddl/uaddw), addv, and a final scalar add of w2.
; - -global-isel run (GI prefix): x0/x1 are advanced by 16 with pre-indexed
;   ldrb loads, so the later small offsets address bytes 16..24; elements are
;   moved lane by lane into 4 x i32 vectors and combined with mul/mla, while
;   byte 24 is multiplied with a scalar mul.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2349 define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2350 ; CHECK-SD-LABEL: test_udot_v25i8:
2351 ; CHECK-SD: // %bb.0: // %entry
2352 ; CHECK-SD-NEXT: ldp q3, q0, [x1]
2353 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
2354 ; CHECK-SD-NEXT: ldp q2, q1, [x0]
2355 ; CHECK-SD-NEXT: umull2 v4.8h, v0.16b, v1.16b
2356 ; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b
2357 ; CHECK-SD-NEXT: umull v1.8h, v3.8b, v2.8b
2358 ; CHECK-SD-NEXT: umull2 v2.8h, v3.16b, v2.16b
2359 ; CHECK-SD-NEXT: ushll v3.4s, v4.4h, #0
2360 ; CHECK-SD-NEXT: uaddl2 v4.4s, v1.8h, v0.8h
2361 ; CHECK-SD-NEXT: uaddl v0.4s, v1.4h, v0.4h
2362 ; CHECK-SD-NEXT: mov v5.s[0], v3.s[0]
2363 ; CHECK-SD-NEXT: uaddw2 v1.4s, v4.4s, v2.8h
2364 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
2365 ; CHECK-SD-NEXT: uaddw v2.4s, v5.4s, v2.4h
2366 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
2367 ; CHECK-SD-NEXT: addv s0, v0.4s
2368 ; CHECK-SD-NEXT: fmov w8, s0
2369 ; CHECK-SD-NEXT: add w0, w8, w2
2370 ; CHECK-SD-NEXT: ret
2372 ; CHECK-GI-LABEL: test_udot_v25i8:
2373 ; CHECK-GI: // %bb.0: // %entry
2374 ; CHECK-GI-NEXT: ldr q1, [x1]
2375 ; CHECK-GI-NEXT: ldrb w11, [x1, #16]!
2376 ; CHECK-GI-NEXT: ldrb w12, [x1, #4]
2377 ; CHECK-GI-NEXT: ldr q0, [x0]
2378 ; CHECK-GI-NEXT: mov v23.s[0], wzr
2379 ; CHECK-GI-NEXT: umov w9, v1.b[4]
2380 ; CHECK-GI-NEXT: umov w10, v1.b[12]
2381 ; CHECK-GI-NEXT: umov w13, v1.b[0]
2382 ; CHECK-GI-NEXT: umov w14, v1.b[5]
2383 ; CHECK-GI-NEXT: mov v5.s[0], w12
2384 ; CHECK-GI-NEXT: mov v3.s[0], w11
2385 ; CHECK-GI-NEXT: umov w11, v0.b[0]
2386 ; CHECK-GI-NEXT: umov w12, v1.b[1]
2387 ; CHECK-GI-NEXT: umov w15, v1.b[8]
2388 ; CHECK-GI-NEXT: ldrb w8, [x0, #16]!
2389 ; CHECK-GI-NEXT: mov v23.s[1], wzr
2390 ; CHECK-GI-NEXT: mov v2.s[0], w9
2391 ; CHECK-GI-NEXT: mov v4.s[0], w10
2392 ; CHECK-GI-NEXT: umov w10, v1.b[13]
2393 ; CHECK-GI-NEXT: ldrb w9, [x1, #5]
2394 ; CHECK-GI-NEXT: mov v6.s[0], w13
2395 ; CHECK-GI-NEXT: umov w13, v1.b[6]
2396 ; CHECK-GI-NEXT: mov v16.s[0], w11
2397 ; CHECK-GI-NEXT: umov w11, v1.b[2]
2398 ; CHECK-GI-NEXT: mov v7.s[0], w15
2399 ; CHECK-GI-NEXT: mov v5.s[1], w9
2400 ; CHECK-GI-NEXT: ldrb w9, [x1, #6]
2401 ; CHECK-GI-NEXT: umov w15, v1.b[9]
2402 ; CHECK-GI-NEXT: mov v2.s[1], w14
2403 ; CHECK-GI-NEXT: ldrb w14, [x1, #1]
2404 ; CHECK-GI-NEXT: mov v4.s[1], w10
2405 ; CHECK-GI-NEXT: umov w10, v1.b[14]
2406 ; CHECK-GI-NEXT: mov v6.s[1], w12
2407 ; CHECK-GI-NEXT: umov w12, v0.b[1]
2408 ; CHECK-GI-NEXT: mov v3.s[1], w14
2409 ; CHECK-GI-NEXT: umov w14, v0.b[12]
2410 ; CHECK-GI-NEXT: mov v21.s[0], w8
2411 ; CHECK-GI-NEXT: ldrb w8, [x0, #1]
2412 ; CHECK-GI-NEXT: mov v5.s[2], w9
2413 ; CHECK-GI-NEXT: umov w9, v0.b[4]
2414 ; CHECK-GI-NEXT: mov v2.s[2], w13
2415 ; CHECK-GI-NEXT: umov w13, v1.b[7]
2416 ; CHECK-GI-NEXT: mov v7.s[1], w15
2417 ; CHECK-GI-NEXT: mov v4.s[2], w10
2418 ; CHECK-GI-NEXT: umov w10, v1.b[15]
2419 ; CHECK-GI-NEXT: mov v16.s[1], w12
2420 ; CHECK-GI-NEXT: ldrb w12, [x1, #2]
2421 ; CHECK-GI-NEXT: mov v6.s[2], w11
2422 ; CHECK-GI-NEXT: umov w11, v0.b[2]
2423 ; CHECK-GI-NEXT: mov v17.s[0], w9
2424 ; CHECK-GI-NEXT: umov w9, v0.b[8]
2425 ; CHECK-GI-NEXT: mov v18.s[0], w14
2426 ; CHECK-GI-NEXT: mov v2.s[3], w13
2427 ; CHECK-GI-NEXT: ldrb w13, [x1, #7]
2428 ; CHECK-GI-NEXT: mov v3.s[2], w12
2429 ; CHECK-GI-NEXT: ldrb w12, [x0, #4]
2430 ; CHECK-GI-NEXT: mov v4.s[3], w10
2431 ; CHECK-GI-NEXT: umov w10, v0.b[5]
2432 ; CHECK-GI-NEXT: mov v5.s[3], w13
2433 ; CHECK-GI-NEXT: ldrb w13, [x0, #8]
2434 ; CHECK-GI-NEXT: mov v16.s[2], w11
2435 ; CHECK-GI-NEXT: umov w11, v0.b[13]
2436 ; CHECK-GI-NEXT: mov v20.s[0], w12
2437 ; CHECK-GI-NEXT: ldrb w12, [x1, #8]
2438 ; CHECK-GI-NEXT: mov v19.s[0], w9
2439 ; CHECK-GI-NEXT: umov w9, v0.b[6]
2440 ; CHECK-GI-NEXT: umov w15, v1.b[10]
2441 ; CHECK-GI-NEXT: mul w12, w12, w13
2442 ; CHECK-GI-NEXT: mov v17.s[1], w10
2443 ; CHECK-GI-NEXT: ldrb w10, [x0, #5]
2444 ; CHECK-GI-NEXT: umov w13, v0.b[9]
2445 ; CHECK-GI-NEXT: mov v21.s[1], w8
2446 ; CHECK-GI-NEXT: umov w8, v1.b[11]
2447 ; CHECK-GI-NEXT: mov v18.s[1], w11
2448 ; CHECK-GI-NEXT: umov w11, v0.b[14]
2449 ; CHECK-GI-NEXT: mov v20.s[1], w10
2450 ; CHECK-GI-NEXT: ldrb w10, [x0, #6]
2451 ; CHECK-GI-NEXT: mov v22.s[0], w12
2452 ; CHECK-GI-NEXT: umov w12, v0.b[7]
2453 ; CHECK-GI-NEXT: mov v17.s[2], w9
2454 ; CHECK-GI-NEXT: umov w9, v0.b[10]
2455 ; CHECK-GI-NEXT: mov v7.s[2], w15
2456 ; CHECK-GI-NEXT: mov v19.s[1], w13
2457 ; CHECK-GI-NEXT: umov w13, v1.b[3]
2458 ; CHECK-GI-NEXT: mov v23.s[2], wzr
2459 ; CHECK-GI-NEXT: mov v18.s[2], w11
2460 ; CHECK-GI-NEXT: umov w11, v0.b[15]
2461 ; CHECK-GI-NEXT: mov v20.s[2], w10
2462 ; CHECK-GI-NEXT: ldrb w10, [x0, #2]
2463 ; CHECK-GI-NEXT: mov v22.s[1], wzr
2464 ; CHECK-GI-NEXT: mov v17.s[3], w12
2465 ; CHECK-GI-NEXT: ldrb w12, [x0, #7]
2466 ; CHECK-GI-NEXT: mov v7.s[3], w8
2467 ; CHECK-GI-NEXT: ldrb w8, [x0, #3]
2468 ; CHECK-GI-NEXT: mov v19.s[2], w9
2469 ; CHECK-GI-NEXT: umov w9, v0.b[3]
2470 ; CHECK-GI-NEXT: mov v18.s[3], w11
2471 ; CHECK-GI-NEXT: umov w11, v0.b[11]
2472 ; CHECK-GI-NEXT: mov v21.s[2], w10
2473 ; CHECK-GI-NEXT: ldrb w10, [x1, #3]
2474 ; CHECK-GI-NEXT: mov v20.s[3], w12
2475 ; CHECK-GI-NEXT: mov v22.s[2], wzr
2476 ; CHECK-GI-NEXT: mov v6.s[3], w13
2477 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v17.4s
2478 ; CHECK-GI-NEXT: mov v23.s[3], wzr
2479 ; CHECK-GI-NEXT: mov v3.s[3], w10
2480 ; CHECK-GI-NEXT: mov v16.s[3], w9
2481 ; CHECK-GI-NEXT: mov v19.s[3], w11
2482 ; CHECK-GI-NEXT: mul v1.4s, v4.4s, v18.4s
2483 ; CHECK-GI-NEXT: mov v21.s[3], w8
2484 ; CHECK-GI-NEXT: mul v2.4s, v5.4s, v20.4s
2485 ; CHECK-GI-NEXT: mov v22.s[3], wzr
2486 ; CHECK-GI-NEXT: mla v0.4s, v6.4s, v16.4s
2487 ; CHECK-GI-NEXT: mla v1.4s, v7.4s, v19.4s
2488 ; CHECK-GI-NEXT: mla v2.4s, v3.4s, v21.4s
2489 ; CHECK-GI-NEXT: add v3.4s, v22.4s, v23.4s
2490 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
2491 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
2492 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
2493 ; CHECK-GI-NEXT: addv s0, v0.4s
2494 ; CHECK-GI-NEXT: fmov w8, s0
2495 ; CHECK-GI-NEXT: add w0, w8, w2
2496 ; CHECK-GI-NEXT: ret
; IR under test: zext(load %a) * zext(load %b), reduced with
; llvm.vector.reduce.add.v25i32, plus the incoming %sum accumulator.
2498 %0 = load <25 x i8>, ptr %a
2499 %1 = zext <25 x i8> %0 to <25 x i32>
2500 %2 = load <25 x i8>, ptr %b
2501 %3 = zext <25 x i8> %2 to <25 x i32>
2502 %4 = mul nuw nsw <25 x i32> %3, %1
2503 %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4)
2504 %op.extra = add i32 %5, %sum
; test_udot_v25i8_nomla: add-reduction of a zero-extended <25 x i8> load, with
; no multiply in the IR.
;   %a1 - pointer the <25 x i8> vector is loaded from.
; The assertion comments below are autogenerated (see the NOTE on the first
; line of this file). The SD-prefixed group belongs to the RUN line without
; -global-isel and the GI-prefixed group to the RUN line with -global-isel.
; Regenerate them with the script instead of editing them by hand.
; Neither expected sequence contains a udot instruction: both widen the bytes
; to 32-bit lanes and sum them with vector adds followed by addv.
2508 define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
2509 ; CHECK-SD-LABEL: test_udot_v25i8_nomla:
2510 ; CHECK-SD: // %bb.0: // %entry
2511 ; CHECK-SD-NEXT: ldp q2, q1, [x0]
2512 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
2513 ; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0
2514 ; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
2515 ; CHECK-SD-NEXT: ushll v4.8h, v2.8b, #0
2516 ; CHECK-SD-NEXT: ushll2 v2.8h, v2.16b, #0
2517 ; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
2518 ; CHECK-SD-NEXT: uaddl2 v5.4s, v4.8h, v1.8h
2519 ; CHECK-SD-NEXT: uaddl v1.4s, v4.4h, v1.4h
2520 ; CHECK-SD-NEXT: mov v0.s[0], v3.s[0]
2521 ; CHECK-SD-NEXT: uaddw2 v3.4s, v5.4s, v2.8h
2522 ; CHECK-SD-NEXT: add v1.4s, v1.4s, v3.4s
2523 ; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v2.4h
2524 ; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
2525 ; CHECK-SD-NEXT: addv s0, v0.4s
2526 ; CHECK-SD-NEXT: fmov w0, s0
2527 ; CHECK-SD-NEXT: ret
2529 ; CHECK-GI-LABEL: test_udot_v25i8_nomla:
2530 ; CHECK-GI: // %bb.0: // %entry
2531 ; CHECK-GI-NEXT: ldr q1, [x0]
2532 ; CHECK-GI-NEXT: ldrb w11, [x0, #16]!
2533 ; CHECK-GI-NEXT: ldrb w14, [x0, #4]
2534 ; CHECK-GI-NEXT: ldrb w17, [x0, #8]
2535 ; CHECK-GI-NEXT: mov v0.s[0], wzr
2536 ; CHECK-GI-NEXT: umov w12, v1.b[0]
2537 ; CHECK-GI-NEXT: umov w13, v1.b[4]
2538 ; CHECK-GI-NEXT: umov w15, v1.b[8]
2539 ; CHECK-GI-NEXT: umov w16, v1.b[12]
2540 ; CHECK-GI-NEXT: umov w18, v1.b[1]
2541 ; CHECK-GI-NEXT: umov w1, v1.b[5]
2542 ; CHECK-GI-NEXT: umov w2, v1.b[9]
2543 ; CHECK-GI-NEXT: umov w3, v1.b[13]
2544 ; CHECK-GI-NEXT: mov v4.s[0], w11
2545 ; CHECK-GI-NEXT: mov v7.s[0], w14
2546 ; CHECK-GI-NEXT: mov v16.s[0], w17
2547 ; CHECK-GI-NEXT: ldrb w8, [x0, #1]
2548 ; CHECK-GI-NEXT: mov v2.s[0], w12
2549 ; CHECK-GI-NEXT: mov v3.s[0], w13
2550 ; CHECK-GI-NEXT: ldrb w10, [x0, #5]
2551 ; CHECK-GI-NEXT: mov v5.s[0], w15
2552 ; CHECK-GI-NEXT: mov v6.s[0], w16
2553 ; CHECK-GI-NEXT: ldrb w16, [x0, #2]
2554 ; CHECK-GI-NEXT: umov w9, v1.b[2]
2555 ; CHECK-GI-NEXT: umov w12, v1.b[6]
2556 ; CHECK-GI-NEXT: ldrb w17, [x0, #6]
2557 ; CHECK-GI-NEXT: umov w14, v1.b[10]
2558 ; CHECK-GI-NEXT: umov w15, v1.b[14]
2559 ; CHECK-GI-NEXT: mov v4.s[1], w8
2560 ; CHECK-GI-NEXT: mov v2.s[1], w18
2561 ; CHECK-GI-NEXT: mov v3.s[1], w1
2562 ; CHECK-GI-NEXT: mov v7.s[1], w10
2563 ; CHECK-GI-NEXT: mov v5.s[1], w2
2564 ; CHECK-GI-NEXT: mov v6.s[1], w3
2565 ; CHECK-GI-NEXT: mov v16.s[1], wzr
2566 ; CHECK-GI-NEXT: mov v0.s[1], wzr
2567 ; CHECK-GI-NEXT: umov w11, v1.b[3]
2568 ; CHECK-GI-NEXT: umov w13, v1.b[7]
2569 ; CHECK-GI-NEXT: umov w8, v1.b[11]
2570 ; CHECK-GI-NEXT: umov w10, v1.b[15]
2571 ; CHECK-GI-NEXT: mov v4.s[2], w16
2572 ; CHECK-GI-NEXT: mov v2.s[2], w9
2573 ; CHECK-GI-NEXT: ldrb w9, [x0, #3]
2574 ; CHECK-GI-NEXT: mov v3.s[2], w12
2575 ; CHECK-GI-NEXT: ldrb w12, [x0, #7]
2576 ; CHECK-GI-NEXT: mov v5.s[2], w14
2577 ; CHECK-GI-NEXT: mov v6.s[2], w15
2578 ; CHECK-GI-NEXT: mov v7.s[2], w17
2579 ; CHECK-GI-NEXT: mov v16.s[2], wzr
2580 ; CHECK-GI-NEXT: mov v0.s[2], wzr
2581 ; CHECK-GI-NEXT: mov v4.s[3], w9
2582 ; CHECK-GI-NEXT: mov v2.s[3], w11
2583 ; CHECK-GI-NEXT: mov v3.s[3], w13
2584 ; CHECK-GI-NEXT: mov v5.s[3], w8
2585 ; CHECK-GI-NEXT: mov v6.s[3], w10
2586 ; CHECK-GI-NEXT: mov v7.s[3], w12
2587 ; CHECK-GI-NEXT: mov v16.s[3], wzr
2588 ; CHECK-GI-NEXT: mov v0.s[3], wzr
2589 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
2590 ; CHECK-GI-NEXT: add v2.4s, v5.4s, v6.4s
2591 ; CHECK-GI-NEXT: add v3.4s, v4.4s, v7.4s
2592 ; CHECK-GI-NEXT: add v0.4s, v16.4s, v0.4s
2593 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
2594 ; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
2595 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
2596 ; CHECK-GI-NEXT: addv s0, v0.4s
2597 ; CHECK-GI-NEXT: fmov w0, s0
2598 ; CHECK-GI-NEXT: ret
; IR under test: load 25 bytes, zero-extend each lane to i32, then sum all
; 25 lanes with the v25i32 add-reduction intrinsic.
2600 %0 = load <25 x i8>, ptr %a1
2601 %1 = zext <25 x i8> %0 to <25 x i32>
2602 %2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %1)
; test_sdot_v25i8: signed dot-product pattern over two <25 x i8> vectors loaded
; from memory, with an extra scalar accumulated on top.
;   %a, %b - pointers the two <25 x i8> operands are loaded from.
;   %sum   - i32 added to the reduced value.
; The assertion comments below are autogenerated (see the NOTE on the first
; line of this file). The SD-prefixed group belongs to the RUN line without
; -global-isel and the GI-prefixed group to the RUN line with -global-isel.
; Regenerate them with the script instead of editing them by hand.
; Neither expected sequence contains an sdot instruction: the first uses
; smull/saddl/saddw widening arithmetic, the second builds 4 x i32 vectors
; lane by lane and combines them with mul/mla/add.
2605 define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2606 ; CHECK-SD-LABEL: test_sdot_v25i8:
2607 ; CHECK-SD: // %bb.0: // %entry
2608 ; CHECK-SD-NEXT: ldp q3, q0, [x1]
2609 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
2610 ; CHECK-SD-NEXT: ldp q2, q1, [x0]
2611 ; CHECK-SD-NEXT: smull2 v4.8h, v0.16b, v1.16b
2612 ; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
2613 ; CHECK-SD-NEXT: smull v1.8h, v3.8b, v2.8b
2614 ; CHECK-SD-NEXT: smull2 v2.8h, v3.16b, v2.16b
2615 ; CHECK-SD-NEXT: sshll v3.4s, v4.4h, #0
2616 ; CHECK-SD-NEXT: saddl2 v4.4s, v1.8h, v0.8h
2617 ; CHECK-SD-NEXT: saddl v0.4s, v1.4h, v0.4h
2618 ; CHECK-SD-NEXT: mov v5.s[0], v3.s[0]
2619 ; CHECK-SD-NEXT: saddw2 v1.4s, v4.4s, v2.8h
2620 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
2621 ; CHECK-SD-NEXT: saddw v2.4s, v5.4s, v2.4h
2622 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
2623 ; CHECK-SD-NEXT: addv s0, v0.4s
2624 ; CHECK-SD-NEXT: fmov w8, s0
2625 ; CHECK-SD-NEXT: add w0, w8, w2
2626 ; CHECK-SD-NEXT: ret
2628 ; CHECK-GI-LABEL: test_sdot_v25i8:
2629 ; CHECK-GI: // %bb.0: // %entry
2630 ; CHECK-GI-NEXT: ldr q1, [x1]
2631 ; CHECK-GI-NEXT: ldrsb w11, [x1, #16]!
2632 ; CHECK-GI-NEXT: ldrsb w12, [x1, #4]
2633 ; CHECK-GI-NEXT: ldr q0, [x0]
2634 ; CHECK-GI-NEXT: mov v23.s[0], wzr
2635 ; CHECK-GI-NEXT: smov w9, v1.b[4]
2636 ; CHECK-GI-NEXT: smov w10, v1.b[12]
2637 ; CHECK-GI-NEXT: smov w13, v1.b[0]
2638 ; CHECK-GI-NEXT: smov w14, v1.b[5]
2639 ; CHECK-GI-NEXT: mov v5.s[0], w12
2640 ; CHECK-GI-NEXT: mov v3.s[0], w11
2641 ; CHECK-GI-NEXT: smov w11, v0.b[0]
2642 ; CHECK-GI-NEXT: smov w12, v1.b[1]
2643 ; CHECK-GI-NEXT: smov w15, v1.b[8]
2644 ; CHECK-GI-NEXT: ldrsb w8, [x0, #16]!
2645 ; CHECK-GI-NEXT: mov v23.s[1], wzr
2646 ; CHECK-GI-NEXT: mov v2.s[0], w9
2647 ; CHECK-GI-NEXT: mov v4.s[0], w10
2648 ; CHECK-GI-NEXT: smov w10, v1.b[13]
2649 ; CHECK-GI-NEXT: ldrsb w9, [x1, #5]
2650 ; CHECK-GI-NEXT: mov v6.s[0], w13
2651 ; CHECK-GI-NEXT: smov w13, v1.b[6]
2652 ; CHECK-GI-NEXT: mov v16.s[0], w11
2653 ; CHECK-GI-NEXT: smov w11, v1.b[2]
2654 ; CHECK-GI-NEXT: mov v7.s[0], w15
2655 ; CHECK-GI-NEXT: mov v5.s[1], w9
2656 ; CHECK-GI-NEXT: ldrsb w9, [x1, #6]
2657 ; CHECK-GI-NEXT: smov w15, v1.b[9]
2658 ; CHECK-GI-NEXT: mov v2.s[1], w14
2659 ; CHECK-GI-NEXT: ldrsb w14, [x1, #1]
2660 ; CHECK-GI-NEXT: mov v4.s[1], w10
2661 ; CHECK-GI-NEXT: smov w10, v1.b[14]
2662 ; CHECK-GI-NEXT: mov v6.s[1], w12
2663 ; CHECK-GI-NEXT: smov w12, v0.b[1]
2664 ; CHECK-GI-NEXT: mov v3.s[1], w14
2665 ; CHECK-GI-NEXT: smov w14, v0.b[12]
2666 ; CHECK-GI-NEXT: mov v21.s[0], w8
2667 ; CHECK-GI-NEXT: ldrsb w8, [x0, #1]
2668 ; CHECK-GI-NEXT: mov v5.s[2], w9
2669 ; CHECK-GI-NEXT: smov w9, v0.b[4]
2670 ; CHECK-GI-NEXT: mov v2.s[2], w13
2671 ; CHECK-GI-NEXT: smov w13, v1.b[7]
2672 ; CHECK-GI-NEXT: mov v7.s[1], w15
2673 ; CHECK-GI-NEXT: mov v4.s[2], w10
2674 ; CHECK-GI-NEXT: smov w10, v1.b[15]
2675 ; CHECK-GI-NEXT: mov v16.s[1], w12
2676 ; CHECK-GI-NEXT: ldrsb w12, [x1, #2]
2677 ; CHECK-GI-NEXT: mov v6.s[2], w11
2678 ; CHECK-GI-NEXT: smov w11, v0.b[2]
2679 ; CHECK-GI-NEXT: mov v17.s[0], w9
2680 ; CHECK-GI-NEXT: smov w9, v0.b[8]
2681 ; CHECK-GI-NEXT: mov v18.s[0], w14
2682 ; CHECK-GI-NEXT: mov v2.s[3], w13
2683 ; CHECK-GI-NEXT: ldrsb w13, [x1, #7]
2684 ; CHECK-GI-NEXT: mov v3.s[2], w12
2685 ; CHECK-GI-NEXT: ldrsb w12, [x0, #4]
2686 ; CHECK-GI-NEXT: mov v4.s[3], w10
2687 ; CHECK-GI-NEXT: smov w10, v0.b[5]
2688 ; CHECK-GI-NEXT: mov v5.s[3], w13
2689 ; CHECK-GI-NEXT: ldrsb w13, [x0, #8]
2690 ; CHECK-GI-NEXT: mov v16.s[2], w11
2691 ; CHECK-GI-NEXT: smov w11, v0.b[13]
2692 ; CHECK-GI-NEXT: mov v20.s[0], w12
2693 ; CHECK-GI-NEXT: ldrsb w12, [x1, #8]
2694 ; CHECK-GI-NEXT: mov v19.s[0], w9
2695 ; CHECK-GI-NEXT: smov w9, v0.b[6]
2696 ; CHECK-GI-NEXT: smov w15, v1.b[10]
2697 ; CHECK-GI-NEXT: mul w12, w12, w13
2698 ; CHECK-GI-NEXT: mov v17.s[1], w10
2699 ; CHECK-GI-NEXT: ldrsb w10, [x0, #5]
2700 ; CHECK-GI-NEXT: smov w13, v0.b[9]
2701 ; CHECK-GI-NEXT: mov v21.s[1], w8
2702 ; CHECK-GI-NEXT: smov w8, v1.b[11]
2703 ; CHECK-GI-NEXT: mov v18.s[1], w11
2704 ; CHECK-GI-NEXT: smov w11, v0.b[14]
2705 ; CHECK-GI-NEXT: mov v20.s[1], w10
2706 ; CHECK-GI-NEXT: ldrsb w10, [x0, #6]
2707 ; CHECK-GI-NEXT: mov v22.s[0], w12
2708 ; CHECK-GI-NEXT: smov w12, v0.b[7]
2709 ; CHECK-GI-NEXT: mov v17.s[2], w9
2710 ; CHECK-GI-NEXT: smov w9, v0.b[10]
2711 ; CHECK-GI-NEXT: mov v7.s[2], w15
2712 ; CHECK-GI-NEXT: mov v19.s[1], w13
2713 ; CHECK-GI-NEXT: smov w13, v1.b[3]
2714 ; CHECK-GI-NEXT: mov v23.s[2], wzr
2715 ; CHECK-GI-NEXT: mov v18.s[2], w11
2716 ; CHECK-GI-NEXT: smov w11, v0.b[15]
2717 ; CHECK-GI-NEXT: mov v20.s[2], w10
2718 ; CHECK-GI-NEXT: ldrsb w10, [x0, #2]
2719 ; CHECK-GI-NEXT: mov v22.s[1], wzr
2720 ; CHECK-GI-NEXT: mov v17.s[3], w12
2721 ; CHECK-GI-NEXT: ldrsb w12, [x0, #7]
2722 ; CHECK-GI-NEXT: mov v7.s[3], w8
2723 ; CHECK-GI-NEXT: ldrsb w8, [x0, #3]
2724 ; CHECK-GI-NEXT: mov v19.s[2], w9
2725 ; CHECK-GI-NEXT: smov w9, v0.b[3]
2726 ; CHECK-GI-NEXT: mov v18.s[3], w11
2727 ; CHECK-GI-NEXT: smov w11, v0.b[11]
2728 ; CHECK-GI-NEXT: mov v21.s[2], w10
2729 ; CHECK-GI-NEXT: ldrsb w10, [x1, #3]
2730 ; CHECK-GI-NEXT: mov v20.s[3], w12
2731 ; CHECK-GI-NEXT: mov v22.s[2], wzr
2732 ; CHECK-GI-NEXT: mov v6.s[3], w13
2733 ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v17.4s
2734 ; CHECK-GI-NEXT: mov v23.s[3], wzr
2735 ; CHECK-GI-NEXT: mov v3.s[3], w10
2736 ; CHECK-GI-NEXT: mov v16.s[3], w9
2737 ; CHECK-GI-NEXT: mov v19.s[3], w11
2738 ; CHECK-GI-NEXT: mul v1.4s, v4.4s, v18.4s
2739 ; CHECK-GI-NEXT: mov v21.s[3], w8
2740 ; CHECK-GI-NEXT: mul v2.4s, v5.4s, v20.4s
2741 ; CHECK-GI-NEXT: mov v22.s[3], wzr
2742 ; CHECK-GI-NEXT: mla v0.4s, v6.4s, v16.4s
2743 ; CHECK-GI-NEXT: mla v1.4s, v7.4s, v19.4s
2744 ; CHECK-GI-NEXT: mla v2.4s, v3.4s, v21.4s
2745 ; CHECK-GI-NEXT: add v3.4s, v22.4s, v23.4s
2746 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
2747 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
2748 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
2749 ; CHECK-GI-NEXT: addv s0, v0.4s
2750 ; CHECK-GI-NEXT: fmov w8, s0
2751 ; CHECK-GI-NEXT: add w0, w8, w2
2752 ; CHECK-GI-NEXT: ret
; IR under test: sign-extend both loaded vectors to <25 x i32>, multiply them
; lane by lane (nsw), sum the 25 products, then add %sum to the result.
2754 %0 = load <25 x i8>, ptr %a
2755 %1 = sext <25 x i8> %0 to <25 x i32>
2756 %2 = load <25 x i8>, ptr %b
2757 %3 = sext <25 x i8> %2 to <25 x i32>
2758 %4 = mul nsw <25 x i32> %3, %1
2759 %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4)
2760 %op.extra = add nsw i32 %5, %sum
; test_sdot_v25i8_double: two independent signed dot-product patterns over
; <25 x i8> vectors passed by value, whose reduced results are added together.
;   %a, %b - operands of the first product.
;   %c, %d - operands of the second product.
; The assertion comments below are autogenerated (see the NOTE on the first
; line of this file). The SD-prefixed group belongs to the RUN line without
; -global-isel and the GI-prefixed group to the RUN line with -global-isel.
; Regenerate them with the script instead of editing them by hand.
; Both expected sequences take their inputs from w0-w7 and from stack slots
; addressed off sp, and neither contains an sdot instruction. The GI-prefixed
; sequence additionally spills and reloads d8, d9 and d10.
; NOTE(review): both multiplies carry `nuw` although their operands are
; sign-extended; confirm that flag is intended for a signed test. Changing it
; would require regenerating the assertions.
2764 define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
2765 ; CHECK-SD-LABEL: test_sdot_v25i8_double:
2766 ; CHECK-SD: // %bb.0: // %entry
2767 ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
2768 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
2769 ; CHECK-SD-NEXT: .cfi_offset w29, -16
2770 ; CHECK-SD-NEXT: ldr b0, [sp, #216]
2771 ; CHECK-SD-NEXT: add x8, sp, #224
2772 ; CHECK-SD-NEXT: ldr b1, [sp, #16]
2773 ; CHECK-SD-NEXT: ldr b2, [sp, #280]
2774 ; CHECK-SD-NEXT: add x9, sp, #240
2775 ; CHECK-SD-NEXT: ldr b4, [sp, #80]
2776 ; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8]
2777 ; CHECK-SD-NEXT: add x8, sp, #24
2778 ; CHECK-SD-NEXT: add x10, sp, #48
2779 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
2780 ; CHECK-SD-NEXT: add x8, sp, #232
2781 ; CHECK-SD-NEXT: add x11, sp, #96
2782 ; CHECK-SD-NEXT: ldr b5, [sp, #152]
2783 ; CHECK-SD-NEXT: add x12, sp, #168
2784 ; CHECK-SD-NEXT: ldr b6, [sp, #616]
2785 ; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8]
2786 ; CHECK-SD-NEXT: add x8, sp, #32
2787 ; CHECK-SD-NEXT: fmov s3, w0
2788 ; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8]
2789 ; CHECK-SD-NEXT: add x8, sp, #288
2790 ; CHECK-SD-NEXT: ldr b7, [sp, #416]
2791 ; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8]
2792 ; CHECK-SD-NEXT: add x8, sp, #40
2793 ; CHECK-SD-NEXT: ldr b22, [sp, #744]
2794 ; CHECK-SD-NEXT: ld1 { v0.b }[3], [x9]
2795 ; CHECK-SD-NEXT: add x9, sp, #248
2796 ; CHECK-SD-NEXT: mov v3.b[1], w1
2797 ; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8]
2798 ; CHECK-SD-NEXT: add x8, sp, #88
2799 ; CHECK-SD-NEXT: ldr b23, [sp, #544]
2800 ; CHECK-SD-NEXT: ld1 { v4.b }[1], [x8]
2801 ; CHECK-SD-NEXT: add x8, sp, #256
2802 ; CHECK-SD-NEXT: ldr b19, [sp, #680]
2803 ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x9]
2804 ; CHECK-SD-NEXT: add x9, sp, #296
2805 ; CHECK-SD-NEXT: ldr b20, [sp, #480]
2806 ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x10]
2807 ; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9]
2808 ; CHECK-SD-NEXT: add x10, sp, #160
2809 ; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11]
2810 ; CHECK-SD-NEXT: add x11, sp, #304
2811 ; CHECK-SD-NEXT: ld1 { v5.b }[1], [x10]
2812 ; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8]
2813 ; CHECK-SD-NEXT: add x8, sp, #56
2814 ; CHECK-SD-NEXT: add x10, sp, #264
2815 ; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8]
2816 ; CHECK-SD-NEXT: add x8, sp, #64
2817 ; CHECK-SD-NEXT: ld1 { v2.b }[3], [x11]
2818 ; CHECK-SD-NEXT: add x9, sp, #272
2819 ; CHECK-SD-NEXT: ld1 { v5.b }[2], [x12]
2820 ; CHECK-SD-NEXT: add x11, sp, #72
2821 ; CHECK-SD-NEXT: ld1 { v0.b }[6], [x10]
2822 ; CHECK-SD-NEXT: add x10, sp, #312
2823 ; CHECK-SD-NEXT: mov v3.b[2], w2
2824 ; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8]
2825 ; CHECK-SD-NEXT: add x8, sp, #104
2826 ; CHECK-SD-NEXT: ld1 { v2.b }[4], [x10]
2827 ; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8]
2828 ; CHECK-SD-NEXT: add x8, sp, #112
2829 ; CHECK-SD-NEXT: add x10, sp, #128
2830 ; CHECK-SD-NEXT: ld1 { v0.b }[7], [x9]
2831 ; CHECK-SD-NEXT: add x9, sp, #320
2832 ; CHECK-SD-NEXT: ldr b21, [sp, #552]
2833 ; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9]
2834 ; CHECK-SD-NEXT: add x9, sp, #176
2835 ; CHECK-SD-NEXT: ld1 { v1.b }[7], [x11]
2836 ; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8]
2837 ; CHECK-SD-NEXT: add x8, sp, #624
2838 ; CHECK-SD-NEXT: ld1 { v5.b }[3], [x9]
2839 ; CHECK-SD-NEXT: ld1 { v6.b }[1], [x8]
2840 ; CHECK-SD-NEXT: add x8, sp, #120
2841 ; CHECK-SD-NEXT: add x9, sp, #328
2842 ; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9]
2843 ; CHECK-SD-NEXT: add x9, sp, #184
2844 ; CHECK-SD-NEXT: add x11, sp, #192
2845 ; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8]
2846 ; CHECK-SD-NEXT: add x8, sp, #632
2847 ; CHECK-SD-NEXT: ld1 { v5.b }[4], [x9]
2848 ; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8]
2849 ; CHECK-SD-NEXT: add x9, sp, #640
2850 ; CHECK-SD-NEXT: add x8, sp, #336
2851 ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8]
2852 ; CHECK-SD-NEXT: add x8, sp, #656
2853 ; CHECK-SD-NEXT: smull v23.8h, v23.8b, v22.8b
2854 ; CHECK-SD-NEXT: ld1 { v5.b }[5], [x11]
2855 ; CHECK-SD-NEXT: add x11, sp, #648
2856 ; CHECK-SD-NEXT: ld1 { v4.b }[6], [x10]
2857 ; CHECK-SD-NEXT: ld1 { v6.b }[3], [x9]
2858 ; CHECK-SD-NEXT: add x9, sp, #200
2859 ; CHECK-SD-NEXT: add x10, sp, #136
2860 ; CHECK-SD-NEXT: ldr b22, [sp, #352]
2861 ; CHECK-SD-NEXT: add x12, sp, #360
2862 ; CHECK-SD-NEXT: mov v3.b[3], w3
2863 ; CHECK-SD-NEXT: ld1 { v5.b }[6], [x9]
2864 ; CHECK-SD-NEXT: add x9, sp, #208
2865 ; CHECK-SD-NEXT: ld1 { v4.b }[7], [x10]
2866 ; CHECK-SD-NEXT: ld1 { v6.b }[4], [x11]
2867 ; CHECK-SD-NEXT: add x11, sp, #424
2868 ; CHECK-SD-NEXT: add x10, sp, #488
2869 ; CHECK-SD-NEXT: ld1 { v7.b }[1], [x11]
2870 ; CHECK-SD-NEXT: add x11, sp, #560
2871 ; CHECK-SD-NEXT: ld1 { v20.b }[1], [x10]
2872 ; CHECK-SD-NEXT: ld1 { v5.b }[7], [x9]
2873 ; CHECK-SD-NEXT: add x9, sp, #440
2874 ; CHECK-SD-NEXT: ld1 { v21.b }[1], [x11]
2875 ; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8]
2876 ; CHECK-SD-NEXT: add x8, sp, #432
2877 ; CHECK-SD-NEXT: ld1 { v22.b }[1], [x12]
2878 ; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8]
2879 ; CHECK-SD-NEXT: add x11, sp, #496
2880 ; CHECK-SD-NEXT: add x12, sp, #568
2881 ; CHECK-SD-NEXT: add x13, sp, #368
2882 ; CHECK-SD-NEXT: ld1 { v20.b }[2], [x11]
2883 ; CHECK-SD-NEXT: ld1 { v21.b }[2], [x12]
2884 ; CHECK-SD-NEXT: ld1 { v22.b }[2], [x13]
2885 ; CHECK-SD-NEXT: add x10, sp, #448
2886 ; CHECK-SD-NEXT: mov v3.b[4], w4
2887 ; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9]
2888 ; CHECK-SD-NEXT: add x9, sp, #688
2889 ; CHECK-SD-NEXT: add x11, sp, #576
2890 ; CHECK-SD-NEXT: ld1 { v19.b }[1], [x9]
2891 ; CHECK-SD-NEXT: add x9, sp, #696
2892 ; CHECK-SD-NEXT: add x12, sp, #376
2893 ; CHECK-SD-NEXT: ld1 { v21.b }[3], [x11]
2894 ; CHECK-SD-NEXT: ld1 { v22.b }[3], [x12]
2895 ; CHECK-SD-NEXT: add x11, sp, #512
2896 ; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10]
2897 ; CHECK-SD-NEXT: add x10, sp, #504
2898 ; CHECK-SD-NEXT: add x12, sp, #584
2899 ; CHECK-SD-NEXT: ld1 { v19.b }[2], [x9]
2900 ; CHECK-SD-NEXT: add x9, sp, #704
2901 ; CHECK-SD-NEXT: ld1 { v20.b }[3], [x10]
2902 ; CHECK-SD-NEXT: add x13, sp, #384
2903 ; CHECK-SD-NEXT: mov v3.b[5], w5
2904 ; CHECK-SD-NEXT: ld1 { v21.b }[4], [x12]
2905 ; CHECK-SD-NEXT: ld1 { v22.b }[4], [x13]
2906 ; CHECK-SD-NEXT: add x10, sp, #456
2907 ; CHECK-SD-NEXT: ldr b16, [sp, #344]
2908 ; CHECK-SD-NEXT: ld1 { v19.b }[3], [x9]
2909 ; CHECK-SD-NEXT: add x9, sp, #712
2910 ; CHECK-SD-NEXT: ld1 { v20.b }[4], [x11]
2911 ; CHECK-SD-NEXT: ldr b17, [sp, #144]
2912 ; CHECK-SD-NEXT: ld1 { v7.b }[5], [x10]
2913 ; CHECK-SD-NEXT: add x10, sp, #520
2914 ; CHECK-SD-NEXT: add x11, sp, #592
2915 ; CHECK-SD-NEXT: add x12, sp, #392
2916 ; CHECK-SD-NEXT: mov v3.b[6], w6
2917 ; CHECK-SD-NEXT: ld1 { v19.b }[4], [x9]
2918 ; CHECK-SD-NEXT: add x9, sp, #720
2919 ; CHECK-SD-NEXT: ld1 { v20.b }[5], [x10]
2920 ; CHECK-SD-NEXT: ld1 { v21.b }[5], [x11]
2921 ; CHECK-SD-NEXT: ld1 { v22.b }[5], [x12]
2922 ; CHECK-SD-NEXT: smull v16.8h, v17.8b, v16.8b
2923 ; CHECK-SD-NEXT: add x8, sp, #664
2924 ; CHECK-SD-NEXT: add x10, sp, #464
2925 ; CHECK-SD-NEXT: add x11, sp, #528
2926 ; CHECK-SD-NEXT: ld1 { v19.b }[5], [x9]
2927 ; CHECK-SD-NEXT: add x9, sp, #728
2928 ; CHECK-SD-NEXT: add x12, sp, #600
2929 ; CHECK-SD-NEXT: add x13, sp, #400
2930 ; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8]
2931 ; CHECK-SD-NEXT: ld1 { v20.b }[6], [x11]
2932 ; CHECK-SD-NEXT: ld1 { v21.b }[6], [x12]
2933 ; CHECK-SD-NEXT: ld1 { v22.b }[6], [x13]
2934 ; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10]
2935 ; CHECK-SD-NEXT: ld1 { v19.b }[6], [x9]
2936 ; CHECK-SD-NEXT: add x9, sp, #736
2937 ; CHECK-SD-NEXT: mov v3.b[7], w7
2938 ; CHECK-SD-NEXT: sshll v18.4s, v16.4h, #0
2939 ; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
2940 ; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
2941 ; CHECK-SD-NEXT: add x8, sp, #672
2942 ; CHECK-SD-NEXT: add x10, sp, #472
2943 ; CHECK-SD-NEXT: add x11, sp, #608
2944 ; CHECK-SD-NEXT: ld1 { v19.b }[7], [x9]
2945 ; CHECK-SD-NEXT: add x9, sp, #536
2946 ; CHECK-SD-NEXT: add x12, sp, #408
2947 ; CHECK-SD-NEXT: ld1 { v20.b }[7], [x9]
2948 ; CHECK-SD-NEXT: ld1 { v21.b }[7], [x11]
2949 ; CHECK-SD-NEXT: ld1 { v22.b }[7], [x12]
2950 ; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8]
2951 ; CHECK-SD-NEXT: ld1 { v7.b }[7], [x10]
2952 ; CHECK-SD-NEXT: sshll v23.4s, v23.4h, #0
2953 ; CHECK-SD-NEXT: smull v0.8h, v1.8b, v0.8b
2954 ; CHECK-SD-NEXT: smull v1.8h, v4.8b, v2.8b
2955 ; CHECK-SD-NEXT: smull v2.8h, v3.8b, v5.8b
2956 ; CHECK-SD-NEXT: smull v3.8h, v20.8b, v19.8b
2957 ; CHECK-SD-NEXT: smull v4.8h, v22.8b, v21.8b
2958 ; CHECK-SD-NEXT: mov v17.s[0], v18.s[0]
2959 ; CHECK-SD-NEXT: smull v5.8h, v7.8b, v6.8b
2960 ; CHECK-SD-NEXT: mov v16.s[0], v23.s[0]
2961 ; CHECK-SD-NEXT: saddl2 v6.4s, v2.8h, v1.8h
2962 ; CHECK-SD-NEXT: saddl v1.4s, v2.4h, v1.4h
2963 ; CHECK-SD-NEXT: saddl2 v2.4s, v4.8h, v3.8h
2964 ; CHECK-SD-NEXT: saddl v3.4s, v4.4h, v3.4h
2965 ; CHECK-SD-NEXT: saddw v4.4s, v17.4s, v0.4h
2966 ; CHECK-SD-NEXT: saddw v7.4s, v16.4s, v5.4h
2967 ; CHECK-SD-NEXT: saddw2 v0.4s, v6.4s, v0.8h
2968 ; CHECK-SD-NEXT: add v1.4s, v1.4s, v4.4s
2969 ; CHECK-SD-NEXT: saddw2 v2.4s, v2.4s, v5.8h
2970 ; CHECK-SD-NEXT: add v3.4s, v3.4s, v7.4s
2971 ; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
2972 ; CHECK-SD-NEXT: add v1.4s, v3.4s, v2.4s
2973 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
2974 ; CHECK-SD-NEXT: addv s0, v0.4s
2975 ; CHECK-SD-NEXT: fmov w0, s0
2976 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
2977 ; CHECK-SD-NEXT: ret
2979 ; CHECK-GI-LABEL: test_sdot_v25i8_double:
2980 ; CHECK-GI: // %bb.0: // %entry
2981 ; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
2982 ; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
2983 ; CHECK-GI-NEXT: str x29, [sp, #24] // 8-byte Folded Spill
2984 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
2985 ; CHECK-GI-NEXT: .cfi_offset w29, -8
2986 ; CHECK-GI-NEXT: .cfi_offset b8, -16
2987 ; CHECK-GI-NEXT: .cfi_offset b9, -24
2988 ; CHECK-GI-NEXT: .cfi_offset b10, -32
2989 ; CHECK-GI-NEXT: ldr w8, [sp, #32]
2990 ; CHECK-GI-NEXT: sxtb w9, w0
2991 ; CHECK-GI-NEXT: sxtb w10, w4
2992 ; CHECK-GI-NEXT: sxtb w11, w5
2993 ; CHECK-GI-NEXT: sxtb w12, w3
2994 ; CHECK-GI-NEXT: sxtb w13, w7
2995 ; CHECK-GI-NEXT: mov v0.s[0], w9
2996 ; CHECK-GI-NEXT: sxtb w8, w8
2997 ; CHECK-GI-NEXT: mov v2.s[0], w10
2998 ; CHECK-GI-NEXT: ldr w9, [sp, #40]
2999 ; CHECK-GI-NEXT: sxtb w10, w1
3000 ; CHECK-GI-NEXT: ldr w14, [sp, #152]
3001 ; CHECK-GI-NEXT: mov v1.s[0], w8
3002 ; CHECK-GI-NEXT: ldr w8, [sp, #64]
3003 ; CHECK-GI-NEXT: mov v9.s[0], wzr
3004 ; CHECK-GI-NEXT: sxtb w9, w9
3005 ; CHECK-GI-NEXT: ldr x29, [sp, #24] // 8-byte Folded Reload
3006 ; CHECK-GI-NEXT: mov v0.s[1], w10
3007 ; CHECK-GI-NEXT: mov v2.s[1], w11
3008 ; CHECK-GI-NEXT: sxtb w8, w8
3009 ; CHECK-GI-NEXT: sxtb w10, w2
3010 ; CHECK-GI-NEXT: sxtb w11, w6
3011 ; CHECK-GI-NEXT: mov v1.s[1], w9
3012 ; CHECK-GI-NEXT: ldr w9, [sp, #48]
3013 ; CHECK-GI-NEXT: mov v3.s[0], w8
3014 ; CHECK-GI-NEXT: ldr w8, [sp, #72]
3015 ; CHECK-GI-NEXT: mov v9.s[1], wzr
3016 ; CHECK-GI-NEXT: sxtb w9, w9
3017 ; CHECK-GI-NEXT: mov v0.s[2], w10
3018 ; CHECK-GI-NEXT: ldr w10, [sp, #56]
3019 ; CHECK-GI-NEXT: sxtb w8, w8
3020 ; CHECK-GI-NEXT: mov v2.s[2], w11
3021 ; CHECK-GI-NEXT: ldr w11, [sp, #80]
3022 ; CHECK-GI-NEXT: mov v1.s[2], w9
3023 ; CHECK-GI-NEXT: ldr w9, [sp, #96]
3024 ; CHECK-GI-NEXT: sxtb w10, w10
3025 ; CHECK-GI-NEXT: mov v3.s[1], w8
3026 ; CHECK-GI-NEXT: sxtb w11, w11
3027 ; CHECK-GI-NEXT: ldr w8, [sp, #88]
3028 ; CHECK-GI-NEXT: mov v0.s[3], w12
3029 ; CHECK-GI-NEXT: ldr w12, [sp, #128]
3030 ; CHECK-GI-NEXT: sxtb w9, w9
3031 ; CHECK-GI-NEXT: mov v2.s[3], w13
3032 ; CHECK-GI-NEXT: ldr w13, [sp, #168]
3033 ; CHECK-GI-NEXT: sxtb w8, w8
3034 ; CHECK-GI-NEXT: mov v1.s[3], w10
3035 ; CHECK-GI-NEXT: ldr w10, [sp, #104]
3036 ; CHECK-GI-NEXT: sxtb w12, w12
3037 ; CHECK-GI-NEXT: mov v4.s[0], w9
3038 ; CHECK-GI-NEXT: mov v3.s[2], w11
3039 ; CHECK-GI-NEXT: sxtb w11, w13
3040 ; CHECK-GI-NEXT: ldr w13, [sp, #136]
3041 ; CHECK-GI-NEXT: sxtb w10, w10
3042 ; CHECK-GI-NEXT: mov v5.s[0], w12
3043 ; CHECK-GI-NEXT: ldr w9, [sp, #112]
3044 ; CHECK-GI-NEXT: ldr w12, [sp, #176]
3045 ; CHECK-GI-NEXT: mov v6.s[0], w11
3046 ; CHECK-GI-NEXT: sxtb w13, w13
3047 ; CHECK-GI-NEXT: ldr w11, [sp, #120]
3048 ; CHECK-GI-NEXT: mov v9.s[2], wzr
3049 ; CHECK-GI-NEXT: mov v4.s[1], w10
3050 ; CHECK-GI-NEXT: ldr w10, [sp, #200]
3051 ; CHECK-GI-NEXT: mov v3.s[3], w8
3052 ; CHECK-GI-NEXT: sxtb w8, w12
3053 ; CHECK-GI-NEXT: ldr w12, [sp, #144]
3054 ; CHECK-GI-NEXT: sxtb w9, w9
3055 ; CHECK-GI-NEXT: mov v5.s[1], w13
3056 ; CHECK-GI-NEXT: sxtb w10, w10
3057 ; CHECK-GI-NEXT: ldr w13, [sp, #184]
3058 ; CHECK-GI-NEXT: mov v6.s[1], w8
3059 ; CHECK-GI-NEXT: sxtb w12, w12
3060 ; CHECK-GI-NEXT: ldr w8, [sp, #160]
3061 ; CHECK-GI-NEXT: mov v4.s[2], w9
3062 ; CHECK-GI-NEXT: ldr w9, [sp, #208]
3063 ; CHECK-GI-NEXT: mov v7.s[0], w10
3064 ; CHECK-GI-NEXT: sxtb w13, w13
3065 ; CHECK-GI-NEXT: sxtb w10, w11
3066 ; CHECK-GI-NEXT: ldr w11, [sp, #192]
3067 ; CHECK-GI-NEXT: mov v5.s[2], w12
3068 ; CHECK-GI-NEXT: sxtb w9, w9
3069 ; CHECK-GI-NEXT: ldr w12, [sp, #232]
3070 ; CHECK-GI-NEXT: mov v6.s[2], w13
3071 ; CHECK-GI-NEXT: sxtb w13, w14
3072 ; CHECK-GI-NEXT: sxtb w11, w11
3073 ; CHECK-GI-NEXT: mov v4.s[3], w10
3074 ; CHECK-GI-NEXT: ldr w10, [sp, #216]
3075 ; CHECK-GI-NEXT: mov v7.s[1], w9
3076 ; CHECK-GI-NEXT: ldr w9, [sp, #264]
3077 ; CHECK-GI-NEXT: sxtb w12, w12
3078 ; CHECK-GI-NEXT: ldr w14, [sp, #368]
3079 ; CHECK-GI-NEXT: mov v5.s[3], w13
3080 ; CHECK-GI-NEXT: ldr w13, [sp, #296]
3081 ; CHECK-GI-NEXT: sxtb w10, w10
3082 ; CHECK-GI-NEXT: sxtb w9, w9
3083 ; CHECK-GI-NEXT: mov v6.s[3], w11
3084 ; CHECK-GI-NEXT: ldr w11, [sp, #240]
3085 ; CHECK-GI-NEXT: mov v16.s[0], w12
3086 ; CHECK-GI-NEXT: ldr w12, [sp, #224]
3087 ; CHECK-GI-NEXT: sxtb w13, w13
3088 ; CHECK-GI-NEXT: mov v7.s[2], w10
3089 ; CHECK-GI-NEXT: ldr w10, [sp, #272]
3090 ; CHECK-GI-NEXT: mov v18.s[0], w9
3091 ; CHECK-GI-NEXT: sxtb w11, w11
3092 ; CHECK-GI-NEXT: ldr w9, [sp, #304]
3093 ; CHECK-GI-NEXT: sxtb w12, w12
3094 ; CHECK-GI-NEXT: mov v17.s[0], w13
3095 ; CHECK-GI-NEXT: sxtb w10, w10
3096 ; CHECK-GI-NEXT: ldr w13, [sp, #248]
3097 ; CHECK-GI-NEXT: mov v16.s[1], w11
3098 ; CHECK-GI-NEXT: ldr w11, [sp, #328]
3099 ; CHECK-GI-NEXT: sxtb w9, w9
3100 ; CHECK-GI-NEXT: mov v7.s[3], w12
3101 ; CHECK-GI-NEXT: ldr w12, [sp, #280]
3102 ; CHECK-GI-NEXT: mov v18.s[1], w10
3103 ; CHECK-GI-NEXT: sxtb w13, w13
3104 ; CHECK-GI-NEXT: ldr w10, [sp, #312]
3105 ; CHECK-GI-NEXT: sxtb w11, w11
3106 ; CHECK-GI-NEXT: mov v17.s[1], w9
3107 ; CHECK-GI-NEXT: sxtb w12, w12
3108 ; CHECK-GI-NEXT: sxtb w8, w8
3109 ; CHECK-GI-NEXT: mov v16.s[2], w13
3110 ; CHECK-GI-NEXT: ldr w13, [sp, #336]
3111 ; CHECK-GI-NEXT: sxtb w10, w10
3112 ; CHECK-GI-NEXT: mov v19.s[0], w11
3113 ; CHECK-GI-NEXT: ldr w11, [sp, #288]
3114 ; CHECK-GI-NEXT: mov v18.s[2], w12
3115 ; CHECK-GI-NEXT: ldr w12, [sp, #320]
3116 ; CHECK-GI-NEXT: sxtb w13, w13
3117 ; CHECK-GI-NEXT: ldr w9, [sp, #256]
3118 ; CHECK-GI-NEXT: mov v17.s[2], w10
3119 ; CHECK-GI-NEXT: sxtb w11, w11
3120 ; CHECK-GI-NEXT: ldr w10, [sp, #344]
3121 ; CHECK-GI-NEXT: sxtb w12, w12
3122 ; CHECK-GI-NEXT: sxtb w9, w9
3123 ; CHECK-GI-NEXT: mul v2.4s, v2.4s, v7.4s
3124 ; CHECK-GI-NEXT: mov v19.s[1], w13
3125 ; CHECK-GI-NEXT: mov v18.s[3], w11
3126 ; CHECK-GI-NEXT: ldr w11, [sp, #360]
3127 ; CHECK-GI-NEXT: sxtb w10, w10
3128 ; CHECK-GI-NEXT: ldr w13, [sp, #400]
3129 ; CHECK-GI-NEXT: mov v16.s[3], w9
3130 ; CHECK-GI-NEXT: mov v17.s[3], w12
3131 ; CHECK-GI-NEXT: sxtb w12, w14
3132 ; CHECK-GI-NEXT: sxtb w11, w11
3133 ; CHECK-GI-NEXT: ldr w9, [sp, #352]
3134 ; CHECK-GI-NEXT: mov v9.s[3], wzr
3135 ; CHECK-GI-NEXT: mla v2.4s, v0.4s, v6.4s
3136 ; CHECK-GI-NEXT: mov v19.s[2], w10
3137 ; CHECK-GI-NEXT: ldr w10, [sp, #376]
3138 ; CHECK-GI-NEXT: mov v20.s[0], w12
3139 ; CHECK-GI-NEXT: ldr w12, [sp, #432]
3140 ; CHECK-GI-NEXT: mul w8, w8, w11
3141 ; CHECK-GI-NEXT: sxtb w11, w13
3142 ; CHECK-GI-NEXT: sxtb w10, w10
3143 ; CHECK-GI-NEXT: ldr w13, [sp, #384]
3144 ; CHECK-GI-NEXT: sxtb w9, w9
3145 ; CHECK-GI-NEXT: mov v23.s[0], w11
3146 ; CHECK-GI-NEXT: ldr w11, [sp, #408]
3147 ; CHECK-GI-NEXT: sxtb w12, w12
3148 ; CHECK-GI-NEXT: mov v20.s[1], w10
3149 ; CHECK-GI-NEXT: ldr w10, [sp, #440]
3150 ; CHECK-GI-NEXT: mov v21.s[0], w8
3151 ; CHECK-GI-NEXT: sxtb w11, w11
3152 ; CHECK-GI-NEXT: mov v22.s[0], w12
3153 ; CHECK-GI-NEXT: ldr w12, [sp, #464]
3154 ; CHECK-GI-NEXT: sxtb w8, w13
3155 ; CHECK-GI-NEXT: ldr w13, [sp, #416]
3156 ; CHECK-GI-NEXT: sxtb w10, w10
3157 ; CHECK-GI-NEXT: mov v23.s[1], w11
3158 ; CHECK-GI-NEXT: sxtb w12, w12
3159 ; CHECK-GI-NEXT: mov v19.s[3], w9
3160 ; CHECK-GI-NEXT: ldr w9, [sp, #392]
3161 ; CHECK-GI-NEXT: mov v20.s[2], w8
3162 ; CHECK-GI-NEXT: sxtb w13, w13
3163 ; CHECK-GI-NEXT: mov v22.s[1], w10
3164 ; CHECK-GI-NEXT: ldr w10, [sp, #472]
3165 ; CHECK-GI-NEXT: mov v24.s[0], w12
3166 ; CHECK-GI-NEXT: ldr w11, [sp, #424]
3167 ; CHECK-GI-NEXT: ldr w8, [sp, #448]
3168 ; CHECK-GI-NEXT: sxtb w9, w9
3169 ; CHECK-GI-NEXT: mov v23.s[2], w13
3170 ; CHECK-GI-NEXT: ldr w13, [sp, #496]
3171 ; CHECK-GI-NEXT: sxtb w10, w10
3172 ; CHECK-GI-NEXT: sxtb w8, w8
3173 ; CHECK-GI-NEXT: mov v20.s[3], w9
3174 ; CHECK-GI-NEXT: ldr w9, [sp, #480]
3175 ; CHECK-GI-NEXT: sxtb w11, w11
3176 ; CHECK-GI-NEXT: sxtb w13, w13
3177 ; CHECK-GI-NEXT: mov v24.s[1], w10
3178 ; CHECK-GI-NEXT: ldr w10, [sp, #528]
3179 ; CHECK-GI-NEXT: ldr w12, [sp, #456]
3180 ; CHECK-GI-NEXT: mov v22.s[2], w8
3181 ; CHECK-GI-NEXT: mov v23.s[3], w11
3182 ; CHECK-GI-NEXT: ldr w11, [sp, #504]
3183 ; CHECK-GI-NEXT: sxtb w9, w9
3184 ; CHECK-GI-NEXT: mov v25.s[0], w13
3185 ; CHECK-GI-NEXT: sxtb w10, w10
3186 ; CHECK-GI-NEXT: ldr w8, [sp, #488]
3187 ; CHECK-GI-NEXT: sxtb w12, w12
3188 ; CHECK-GI-NEXT: sxtb w11, w11
3189 ; CHECK-GI-NEXT: mov v24.s[2], w9
3190 ; CHECK-GI-NEXT: ldr w9, [sp, #536]
3191 ; CHECK-GI-NEXT: mov v26.s[0], w10
3192 ; CHECK-GI-NEXT: ldr w13, [sp, #568]
3193 ; CHECK-GI-NEXT: mov v22.s[3], w12
3194 ; CHECK-GI-NEXT: ldr w12, [sp, #512]
3195 ; CHECK-GI-NEXT: sxtb w8, w8
3196 ; CHECK-GI-NEXT: mov v25.s[1], w11
3197 ; CHECK-GI-NEXT: sxtb w9, w9
3198 ; CHECK-GI-NEXT: sxtb w10, w13
3199 ; CHECK-GI-NEXT: sxtb w12, w12
3200 ; CHECK-GI-NEXT: mov v24.s[3], w8
3201 ; CHECK-GI-NEXT: ldr w8, [sp, #544]
3202 ; CHECK-GI-NEXT: mov v26.s[1], w9
3203 ; CHECK-GI-NEXT: ldr w13, [sp, #520]
3204 ; CHECK-GI-NEXT: ldr w11, [sp, #576]
3205 ; CHECK-GI-NEXT: mov v27.s[0], w10
3206 ; CHECK-GI-NEXT: ldr w10, [sp, #600]
3207 ; CHECK-GI-NEXT: sxtb w8, w8
3208 ; CHECK-GI-NEXT: mov v25.s[2], w12
3209 ; CHECK-GI-NEXT: ldr w12, [sp, #584]
3210 ; CHECK-GI-NEXT: sxtb w9, w11
3211 ; CHECK-GI-NEXT: ldr w11, [sp, #552]
3212 ; CHECK-GI-NEXT: sxtb w10, w10
3213 ; CHECK-GI-NEXT: sxtb w13, w13
3214 ; CHECK-GI-NEXT: mov v26.s[2], w8
3215 ; CHECK-GI-NEXT: sxtb w8, w12
3216 ; CHECK-GI-NEXT: ldr w12, [sp, #664]
3217 ; CHECK-GI-NEXT: mov v27.s[1], w9
3218 ; CHECK-GI-NEXT: ldr w9, [sp, #608]
3219 ; CHECK-GI-NEXT: mov v28.s[0], w10
3220 ; CHECK-GI-NEXT: mov v25.s[3], w13
3221 ; CHECK-GI-NEXT: ldr w13, [sp, #592]
3222 ; CHECK-GI-NEXT: sxtb w11, w11
3223 ; CHECK-GI-NEXT: sxtb w12, w12
3224 ; CHECK-GI-NEXT: sxtb w9, w9
3225 ; CHECK-GI-NEXT: ldr w10, [sp, #560]
3226 ; CHECK-GI-NEXT: mov v26.s[3], w11
3227 ; CHECK-GI-NEXT: sxtb w11, w13
3228 ; CHECK-GI-NEXT: ldr w13, [sp, #672]
3229 ; CHECK-GI-NEXT: mov v30.s[0], w12
3230 ; CHECK-GI-NEXT: mov v27.s[2], w8
3231 ; CHECK-GI-NEXT: ldr w8, [sp, #616]
3232 ; CHECK-GI-NEXT: mov v28.s[1], w9
3233 ; CHECK-GI-NEXT: ldr w9, [sp, #632]
3234 ; CHECK-GI-NEXT: ldr w12, [sp, #728]
3235 ; CHECK-GI-NEXT: sxtb w13, w13
3236 ; CHECK-GI-NEXT: sxtb w8, w8
3237 ; CHECK-GI-NEXT: sxtb w10, w10
3238 ; CHECK-GI-NEXT: sxtb w9, w9
3239 ; CHECK-GI-NEXT: sxtb w12, w12
3240 ; CHECK-GI-NEXT: mov v21.s[1], wzr
3241 ; CHECK-GI-NEXT: mov v30.s[1], w13
3242 ; CHECK-GI-NEXT: ldr w13, [sp, #760]
3243 ; CHECK-GI-NEXT: mov v27.s[3], w11
3244 ; CHECK-GI-NEXT: mov v28.s[2], w8
3245 ; CHECK-GI-NEXT: ldr w8, [sp, #696]
3246 ; CHECK-GI-NEXT: mov v29.s[0], w9
3247 ; CHECK-GI-NEXT: ldr w9, [sp, #736]
3248 ; CHECK-GI-NEXT: mov v8.s[0], w12
3249 ; CHECK-GI-NEXT: sxtb w13, w13
3250 ; CHECK-GI-NEXT: ldr w11, [sp, #640]
3251 ; CHECK-GI-NEXT: sxtb w8, w8
3252 ; CHECK-GI-NEXT: ldr w12, [sp, #680]
3253 ; CHECK-GI-NEXT: sxtb w9, w9
3254 ; CHECK-GI-NEXT: mul w10, w10, w13
3255 ; CHECK-GI-NEXT: mov v21.s[2], wzr
3256 ; CHECK-GI-NEXT: sxtb w11, w11
3257 ; CHECK-GI-NEXT: mov v31.s[0], w8
3258 ; CHECK-GI-NEXT: ldr w8, [sp, #704]
3259 ; CHECK-GI-NEXT: mov v8.s[1], w9
3260 ; CHECK-GI-NEXT: ldr w9, [sp, #744]
3261 ; CHECK-GI-NEXT: mul v3.4s, v3.4s, v18.4s
3262 ; CHECK-GI-NEXT: mov v10.s[0], w10
3263 ; CHECK-GI-NEXT: mov v29.s[1], w11
3264 ; CHECK-GI-NEXT: sxtb w11, w12
3265 ; CHECK-GI-NEXT: sxtb w8, w8
3266 ; CHECK-GI-NEXT: sxtb w9, w9
3267 ; CHECK-GI-NEXT: ldr w12, [sp, #624]
3268 ; CHECK-GI-NEXT: mov v30.s[2], w11
3269 ; CHECK-GI-NEXT: ldr w11, [sp, #648]
3270 ; CHECK-GI-NEXT: ldr w10, [sp, #688]
3271 ; CHECK-GI-NEXT: mov v31.s[1], w8
3272 ; CHECK-GI-NEXT: ldr w8, [sp, #712]
3273 ; CHECK-GI-NEXT: mov v8.s[2], w9
3274 ; CHECK-GI-NEXT: ldr w9, [sp, #752]
3275 ; CHECK-GI-NEXT: mov v10.s[1], wzr
3276 ; CHECK-GI-NEXT: sxtb w12, w12
3277 ; CHECK-GI-NEXT: sxtb w11, w11
3278 ; CHECK-GI-NEXT: sxtb w10, w10
3279 ; CHECK-GI-NEXT: sxtb w8, w8
3280 ; CHECK-GI-NEXT: sxtb w9, w9
3281 ; CHECK-GI-NEXT: mov v28.s[3], w12
3282 ; CHECK-GI-NEXT: mul v5.4s, v5.4s, v19.4s
3283 ; CHECK-GI-NEXT: mov v29.s[2], w11
3284 ; CHECK-GI-NEXT: mov v30.s[3], w10
3285 ; CHECK-GI-NEXT: mov v31.s[2], w8
3286 ; CHECK-GI-NEXT: ldr w8, [sp, #656]
3287 ; CHECK-GI-NEXT: mov v8.s[3], w9
3288 ; CHECK-GI-NEXT: ldr w9, [sp, #720]
3289 ; CHECK-GI-NEXT: mov v10.s[2], wzr
3290 ; CHECK-GI-NEXT: mov v21.s[3], wzr
3291 ; CHECK-GI-NEXT: mla v3.4s, v1.4s, v16.4s
3292 ; CHECK-GI-NEXT: sxtb w8, w8
3293 ; CHECK-GI-NEXT: sxtb w9, w9
3294 ; CHECK-GI-NEXT: mul v7.4s, v23.4s, v28.4s
3295 ; CHECK-GI-NEXT: mul v18.4s, v24.4s, v30.4s
3296 ; CHECK-GI-NEXT: mla v5.4s, v4.4s, v17.4s
3297 ; CHECK-GI-NEXT: mul v19.4s, v26.4s, v8.4s
3298 ; CHECK-GI-NEXT: mov v29.s[3], w8
3299 ; CHECK-GI-NEXT: mov v31.s[3], w9
3300 ; CHECK-GI-NEXT: mov v10.s[3], wzr
3301 ; CHECK-GI-NEXT: add v0.4s, v21.4s, v9.4s
3302 ; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s
3303 ; CHECK-GI-NEXT: mla v7.4s, v20.4s, v27.4s
3304 ; CHECK-GI-NEXT: mla v18.4s, v22.4s, v29.4s
3305 ; CHECK-GI-NEXT: mla v19.4s, v25.4s, v31.4s
3306 ; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s
3307 ; CHECK-GI-NEXT: add v1.4s, v10.4s, v9.4s
3308 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
3309 ; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s
3310 ; CHECK-GI-NEXT: add v3.4s, v7.4s, v18.4s
3311 ; CHECK-GI-NEXT: add v1.4s, v19.4s, v1.4s
3312 ; CHECK-GI-NEXT: addv s0, v0.4s
3313 ; CHECK-GI-NEXT: add v1.4s, v3.4s, v1.4s
3314 ; CHECK-GI-NEXT: fmov w8, s0
3315 ; CHECK-GI-NEXT: addv s1, v1.4s
3316 ; CHECK-GI-NEXT: fmov w9, s1
3317 ; CHECK-GI-NEXT: add w0, w8, w9
3318 ; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
3319 ; CHECK-GI-NEXT: ret
; First product: sign-extend %a and %b, multiply lane by lane, sum 25 lanes.
3321 %az = sext <25 x i8> %a to <25 x i32>
3322 %bz = sext <25 x i8> %b to <25 x i32>
3323 %m1 = mul nuw nsw <25 x i32> %az, %bz
3324 %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m1)
; Second product: the same pattern applied to %c and %d.
3325 %cz = sext <25 x i8> %c to <25 x i32>
3326 %dz = sext <25 x i8> %d to <25 x i32>
3327 %m2 = mul nuw nsw <25 x i32> %cz, %dz
3328 %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m2)
; %x combines the two reduced values.
3329 %x = add i32 %r1, %r2
; test_sdot_v25i8_double_nomla: sign-extends the <25 x i8> arguments %a and %c
; to <25 x i32>, reduces each with llvm.vector.reduce.add.v25i32, and adds the
; two scalar sums into %x. %b and %d are declared but not referenced by the IR
; shown below.
; Neither expected sequence contains an sdot instruction: the SelectionDAG
; output widens with sshll/saddl/saddw, the GlobalISel output sign-extends each
; element with scalar sxtb and inserts it lane by lane. Lanes beyond the 25
; inputs start from zero (movi #0 / wzr inserts).
3333 define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
3334 ; CHECK-SD-LABEL: test_sdot_v25i8_double_nomla:
3335 ; CHECK-SD: // %bb.0: // %entry
3336 ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
3337 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
3338 ; CHECK-SD-NEXT: .cfi_offset w29, -16
3339 ; CHECK-SD-NEXT: fmov s0, w0
3340 ; CHECK-SD-NEXT: ldr b1, [sp, #80]
3341 ; CHECK-SD-NEXT: add x10, sp, #88
3342 ; CHECK-SD-NEXT: ldr b2, [sp, #16]
3343 ; CHECK-SD-NEXT: add x9, sp, #96
3344 ; CHECK-SD-NEXT: ldr b3, [sp, #480]
3345 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x10]
3346 ; CHECK-SD-NEXT: add x10, sp, #24
3347 ; CHECK-SD-NEXT: ldr b4, [sp, #352]
3348 ; CHECK-SD-NEXT: mov v0.b[1], w1
3349 ; CHECK-SD-NEXT: ld1 { v2.b }[1], [x10]
3350 ; CHECK-SD-NEXT: add x11, sp, #488
3351 ; CHECK-SD-NEXT: add x10, sp, #360
3352 ; CHECK-SD-NEXT: ldr b5, [sp, #416]
3353 ; CHECK-SD-NEXT: add x8, sp, #104
3354 ; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9]
3355 ; CHECK-SD-NEXT: add x9, sp, #32
3356 ; CHECK-SD-NEXT: ld1 { v3.b }[1], [x11]
3357 ; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9]
3358 ; CHECK-SD-NEXT: add x11, sp, #424
3359 ; CHECK-SD-NEXT: ld1 { v4.b }[1], [x10]
3360 ; CHECK-SD-NEXT: mov v0.b[2], w2
3361 ; CHECK-SD-NEXT: ld1 { v5.b }[1], [x11]
3362 ; CHECK-SD-NEXT: add x9, sp, #368
3363 ; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8]
3364 ; CHECK-SD-NEXT: add x8, sp, #40
3365 ; CHECK-SD-NEXT: add x12, sp, #496
3366 ; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8]
3367 ; CHECK-SD-NEXT: ld1 { v4.b }[2], [x9]
3368 ; CHECK-SD-NEXT: add x8, sp, #432
3369 ; CHECK-SD-NEXT: ld1 { v3.b }[2], [x12]
3370 ; CHECK-SD-NEXT: add x13, sp, #48
3371 ; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8]
3372 ; CHECK-SD-NEXT: mov v0.b[3], w3
3373 ; CHECK-SD-NEXT: add x10, sp, #112
3374 ; CHECK-SD-NEXT: add x8, sp, #504
3375 ; CHECK-SD-NEXT: ld1 { v2.b }[4], [x13]
3376 ; CHECK-SD-NEXT: add x13, sp, #376
3377 ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x10]
3378 ; CHECK-SD-NEXT: ld1 { v4.b }[3], [x13]
3379 ; CHECK-SD-NEXT: add x13, sp, #440
3380 ; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8]
3381 ; CHECK-SD-NEXT: ld1 { v5.b }[3], [x13]
3382 ; CHECK-SD-NEXT: add x11, sp, #120
3383 ; CHECK-SD-NEXT: add x8, sp, #56
3384 ; CHECK-SD-NEXT: mov v0.b[4], w4
3385 ; CHECK-SD-NEXT: add x13, sp, #512
3386 ; CHECK-SD-NEXT: ld1 { v1.b }[5], [x11]
3387 ; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8]
3388 ; CHECK-SD-NEXT: add x8, sp, #384
3389 ; CHECK-SD-NEXT: add x11, sp, #448
3390 ; CHECK-SD-NEXT: ld1 { v3.b }[4], [x13]
3391 ; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8]
3392 ; CHECK-SD-NEXT: ld1 { v5.b }[4], [x11]
3393 ; CHECK-SD-NEXT: add x12, sp, #128
3394 ; CHECK-SD-NEXT: add x10, sp, #64
3395 ; CHECK-SD-NEXT: add x8, sp, #520
3396 ; CHECK-SD-NEXT: mov v0.b[5], w5
3397 ; CHECK-SD-NEXT: ld1 { v1.b }[6], [x12]
3398 ; CHECK-SD-NEXT: ld1 { v2.b }[6], [x10]
3399 ; CHECK-SD-NEXT: add x10, sp, #392
3400 ; CHECK-SD-NEXT: add x11, sp, #456
3401 ; CHECK-SD-NEXT: ldr b6, [sp, #144]
3402 ; CHECK-SD-NEXT: ldr b7, [sp, #544]
3403 ; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8]
3404 ; CHECK-SD-NEXT: ld1 { v4.b }[5], [x10]
3405 ; CHECK-SD-NEXT: ld1 { v5.b }[5], [x11]
3406 ; CHECK-SD-NEXT: add x9, sp, #136
3407 ; CHECK-SD-NEXT: sshll v6.8h, v6.8b, #0
3408 ; CHECK-SD-NEXT: mov v0.b[6], w6
3409 ; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9]
3410 ; CHECK-SD-NEXT: add x8, sp, #528
3411 ; CHECK-SD-NEXT: add x9, sp, #400
3412 ; CHECK-SD-NEXT: add x10, sp, #464
3413 ; CHECK-SD-NEXT: sshll v7.8h, v7.8b, #0
3414 ; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8]
3415 ; CHECK-SD-NEXT: ld1 { v4.b }[6], [x9]
3416 ; CHECK-SD-NEXT: ld1 { v5.b }[6], [x10]
3417 ; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
3418 ; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
3419 ; CHECK-SD-NEXT: add x14, sp, #72
3420 ; CHECK-SD-NEXT: mov v0.b[7], w7
3421 ; CHECK-SD-NEXT: sshll v6.4s, v6.4h, #0
3422 ; CHECK-SD-NEXT: add x8, sp, #536
3423 ; CHECK-SD-NEXT: add x9, sp, #408
3424 ; CHECK-SD-NEXT: add x10, sp, #472
3425 ; CHECK-SD-NEXT: sshll v7.4s, v7.4h, #0
3426 ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x14]
3427 ; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8]
3428 ; CHECK-SD-NEXT: ld1 { v4.b }[7], [x9]
3429 ; CHECK-SD-NEXT: ld1 { v5.b }[7], [x10]
3430 ; CHECK-SD-NEXT: mov v16.s[0], v6.s[0]
3431 ; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
3432 ; CHECK-SD-NEXT: mov v17.s[0], v7.s[0]
3433 ; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
3434 ; CHECK-SD-NEXT: sshll v2.8h, v2.8b, #0
3435 ; CHECK-SD-NEXT: sshll v3.8h, v3.8b, #0
3436 ; CHECK-SD-NEXT: sshll v4.8h, v4.8b, #0
3437 ; CHECK-SD-NEXT: sshll v5.8h, v5.8b, #0
3438 ; CHECK-SD-NEXT: saddl v7.4s, v0.4h, v1.4h
3439 ; CHECK-SD-NEXT: saddl2 v0.4s, v0.8h, v1.8h
3440 ; CHECK-SD-NEXT: saddw v6.4s, v16.4s, v2.4h
3441 ; CHECK-SD-NEXT: saddl v1.4s, v4.4h, v3.4h
3442 ; CHECK-SD-NEXT: saddl2 v3.4s, v4.8h, v3.8h
3443 ; CHECK-SD-NEXT: saddw v4.4s, v17.4s, v5.4h
3444 ; CHECK-SD-NEXT: saddw2 v0.4s, v0.4s, v2.8h
3445 ; CHECK-SD-NEXT: add v6.4s, v7.4s, v6.4s
3446 ; CHECK-SD-NEXT: saddw2 v2.4s, v3.4s, v5.8h
3447 ; CHECK-SD-NEXT: add v1.4s, v1.4s, v4.4s
3448 ; CHECK-SD-NEXT: add v0.4s, v6.4s, v0.4s
3449 ; CHECK-SD-NEXT: add v1.4s, v1.4s, v2.4s
3450 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
3451 ; CHECK-SD-NEXT: addv s0, v0.4s
3452 ; CHECK-SD-NEXT: fmov w0, s0
3453 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
3454 ; CHECK-SD-NEXT: ret
3456 ; CHECK-GI-LABEL: test_sdot_v25i8_double_nomla:
3457 ; CHECK-GI: // %bb.0: // %entry
3458 ; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
3459 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
3460 ; CHECK-GI-NEXT: .cfi_offset w29, -16
3461 ; CHECK-GI-NEXT: sxtb w8, w0
3462 ; CHECK-GI-NEXT: sxtb w9, w4
3463 ; CHECK-GI-NEXT: ldr w10, [sp, #48]
3464 ; CHECK-GI-NEXT: sxtb w11, w5
3465 ; CHECK-GI-NEXT: sxtb w12, w6
3466 ; CHECK-GI-NEXT: sxtb w13, w7
3467 ; CHECK-GI-NEXT: mov v0.s[0], w8
3468 ; CHECK-GI-NEXT: ldr w8, [sp, #16]
3469 ; CHECK-GI-NEXT: mov v1.s[0], w9
3470 ; CHECK-GI-NEXT: sxtb w9, w1
3471 ; CHECK-GI-NEXT: ldr w14, [sp, #104]
3472 ; CHECK-GI-NEXT: mov v20.s[0], wzr
3473 ; CHECK-GI-NEXT: sxtb w8, w8
3474 ; CHECK-GI-NEXT: mov v0.s[1], w9
3475 ; CHECK-GI-NEXT: ldr w9, [sp, #80]
3476 ; CHECK-GI-NEXT: mov v2.s[0], w8
3477 ; CHECK-GI-NEXT: sxtb w8, w10
3478 ; CHECK-GI-NEXT: mov v1.s[1], w11
3479 ; CHECK-GI-NEXT: ldr w10, [sp, #24]
3480 ; CHECK-GI-NEXT: sxtb w11, w2
3481 ; CHECK-GI-NEXT: sxtb w9, w9
3482 ; CHECK-GI-NEXT: mov v20.s[1], wzr
3483 ; CHECK-GI-NEXT: mov v3.s[0], w8
3484 ; CHECK-GI-NEXT: ldr w8, [sp, #56]
3485 ; CHECK-GI-NEXT: mov v4.s[0], w9
3486 ; CHECK-GI-NEXT: sxtb w9, w10
3487 ; CHECK-GI-NEXT: mov v0.s[2], w11
3488 ; CHECK-GI-NEXT: sxtb w8, w8
3489 ; CHECK-GI-NEXT: ldr w10, [sp, #32]
3490 ; CHECK-GI-NEXT: mov v1.s[2], w12
3491 ; CHECK-GI-NEXT: sxtb w12, w3
3492 ; CHECK-GI-NEXT: mov v2.s[1], w9
3493 ; CHECK-GI-NEXT: ldr w9, [sp, #112]
3494 ; CHECK-GI-NEXT: mov v3.s[1], w8
3495 ; CHECK-GI-NEXT: ldr w8, [sp, #88]
3496 ; CHECK-GI-NEXT: sxtb w10, w10
3497 ; CHECK-GI-NEXT: mov v0.s[3], w12
3498 ; CHECK-GI-NEXT: ldr w12, [sp, #64]
3499 ; CHECK-GI-NEXT: sxtb w9, w9
3500 ; CHECK-GI-NEXT: sxtb w8, w8
3501 ; CHECK-GI-NEXT: ldr w11, [sp, #40]
3502 ; CHECK-GI-NEXT: mov v1.s[3], w13
3503 ; CHECK-GI-NEXT: mov v2.s[2], w10
3504 ; CHECK-GI-NEXT: ldr w10, [sp, #120]
3505 ; CHECK-GI-NEXT: sxtb w12, w12
3506 ; CHECK-GI-NEXT: mov v5.s[0], w9
3507 ; CHECK-GI-NEXT: ldr w9, [sp, #96]
3508 ; CHECK-GI-NEXT: mov v4.s[1], w8
3509 ; CHECK-GI-NEXT: ldr w13, [sp, #72]
3510 ; CHECK-GI-NEXT: sxtb w8, w10
3511 ; CHECK-GI-NEXT: mov v3.s[2], w12
3512 ; CHECK-GI-NEXT: ldr w10, [sp, #352]
3513 ; CHECK-GI-NEXT: sxtb w9, w9
3514 ; CHECK-GI-NEXT: sxtb w11, w11
3515 ; CHECK-GI-NEXT: sxtb w12, w13
3516 ; CHECK-GI-NEXT: sxtb w13, w14
3517 ; CHECK-GI-NEXT: mov v20.s[2], wzr
3518 ; CHECK-GI-NEXT: mov v4.s[2], w9
3519 ; CHECK-GI-NEXT: sxtb w9, w10
3520 ; CHECK-GI-NEXT: mov v2.s[3], w11
3521 ; CHECK-GI-NEXT: ldr w11, [sp, #128]
3522 ; CHECK-GI-NEXT: mov v5.s[1], w8
3523 ; CHECK-GI-NEXT: ldr w10, [sp, #384]
3524 ; CHECK-GI-NEXT: mov v3.s[3], w12
3525 ; CHECK-GI-NEXT: ldr w12, [sp, #360]
3526 ; CHECK-GI-NEXT: mov v6.s[0], w9
3527 ; CHECK-GI-NEXT: sxtb w11, w11
3528 ; CHECK-GI-NEXT: ldr w9, [sp, #144]
3529 ; CHECK-GI-NEXT: sxtb w10, w10
3530 ; CHECK-GI-NEXT: sxtb w12, w12
3531 ; CHECK-GI-NEXT: mov v4.s[3], w13
3532 ; CHECK-GI-NEXT: ldr w13, [sp, #392]
3533 ; CHECK-GI-NEXT: mov v5.s[2], w11
3534 ; CHECK-GI-NEXT: ldr w11, [sp, #368]
3535 ; CHECK-GI-NEXT: mov v7.s[0], w10
3536 ; CHECK-GI-NEXT: sxtb w9, w9
3537 ; CHECK-GI-NEXT: mov v6.s[1], w12
3538 ; CHECK-GI-NEXT: ldr w10, [sp, #416]
3539 ; CHECK-GI-NEXT: sxtb w12, w13
3540 ; CHECK-GI-NEXT: ldr w13, [sp, #448]
3541 ; CHECK-GI-NEXT: ldr w8, [sp, #136]
3542 ; CHECK-GI-NEXT: mov v16.s[0], w9
3543 ; CHECK-GI-NEXT: sxtb w9, w11
3544 ; CHECK-GI-NEXT: sxtb w10, w10
3545 ; CHECK-GI-NEXT: ldr w11, [sp, #400]
3546 ; CHECK-GI-NEXT: mov v7.s[1], w12
3547 ; CHECK-GI-NEXT: ldr w12, [sp, #480]
3548 ; CHECK-GI-NEXT: mov v6.s[2], w9
3549 ; CHECK-GI-NEXT: ldr w9, [sp, #424]
3550 ; CHECK-GI-NEXT: mov v17.s[0], w10
3551 ; CHECK-GI-NEXT: sxtb w13, w13
3552 ; CHECK-GI-NEXT: sxtb w11, w11
3553 ; CHECK-GI-NEXT: sxtb w12, w12
3554 ; CHECK-GI-NEXT: sxtb w9, w9
3555 ; CHECK-GI-NEXT: ldr w10, [sp, #456]
3556 ; CHECK-GI-NEXT: mov v16.s[1], wzr
3557 ; CHECK-GI-NEXT: mov v18.s[0], w13
3558 ; CHECK-GI-NEXT: ldr w13, [sp, #488]
3559 ; CHECK-GI-NEXT: mov v7.s[2], w11
3560 ; CHECK-GI-NEXT: ldr w11, [sp, #512]
3561 ; CHECK-GI-NEXT: mov v19.s[0], w12
3562 ; CHECK-GI-NEXT: mov v17.s[1], w9
3563 ; CHECK-GI-NEXT: ldr w9, [sp, #544]
3564 ; CHECK-GI-NEXT: sxtb w10, w10
3565 ; CHECK-GI-NEXT: sxtb w13, w13
3566 ; CHECK-GI-NEXT: sxtb w11, w11
3567 ; CHECK-GI-NEXT: sxtb w14, w8
3568 ; CHECK-GI-NEXT: ldr w8, [sp, #376]
3569 ; CHECK-GI-NEXT: sxtb w9, w9
3570 ; CHECK-GI-NEXT: mov v18.s[1], w10
3571 ; CHECK-GI-NEXT: ldr w10, [sp, #432]
3572 ; CHECK-GI-NEXT: mov v19.s[1], w13
3573 ; CHECK-GI-NEXT: ldr w13, [sp, #520]
3574 ; CHECK-GI-NEXT: mov v21.s[0], w11
3575 ; CHECK-GI-NEXT: mov v22.s[0], w9
3576 ; CHECK-GI-NEXT: sxtb w10, w10
3577 ; CHECK-GI-NEXT: ldr w11, [sp, #464]
3578 ; CHECK-GI-NEXT: sxtb w13, w13
3579 ; CHECK-GI-NEXT: ldr w9, [sp, #496]
3580 ; CHECK-GI-NEXT: ldr w12, [sp, #408]
3581 ; CHECK-GI-NEXT: mov v17.s[2], w10
3582 ; CHECK-GI-NEXT: ldr w10, [sp, #528]
3583 ; CHECK-GI-NEXT: sxtb w11, w11
3584 ; CHECK-GI-NEXT: mov v21.s[1], w13
3585 ; CHECK-GI-NEXT: sxtb w9, w9
3586 ; CHECK-GI-NEXT: ldr w13, [sp, #440]
3587 ; CHECK-GI-NEXT: mov v22.s[1], wzr
3588 ; CHECK-GI-NEXT: sxtb w10, w10
3589 ; CHECK-GI-NEXT: mov v18.s[2], w11
3590 ; CHECK-GI-NEXT: ldr w11, [sp, #472]
3591 ; CHECK-GI-NEXT: mov v19.s[2], w9
3592 ; CHECK-GI-NEXT: ldr w9, [sp, #504]
3593 ; CHECK-GI-NEXT: mov v16.s[2], wzr
3594 ; CHECK-GI-NEXT: sxtb w8, w8
3595 ; CHECK-GI-NEXT: sxtb w12, w12
3596 ; CHECK-GI-NEXT: mov v21.s[2], w10
3597 ; CHECK-GI-NEXT: ldr w10, [sp, #536]
3598 ; CHECK-GI-NEXT: sxtb w13, w13
3599 ; CHECK-GI-NEXT: mov v22.s[2], wzr
3600 ; CHECK-GI-NEXT: mov v6.s[3], w8
3601 ; CHECK-GI-NEXT: sxtb w8, w11
3602 ; CHECK-GI-NEXT: sxtb w9, w9
3603 ; CHECK-GI-NEXT: sxtb w10, w10
3604 ; CHECK-GI-NEXT: mov v5.s[3], w14
3605 ; CHECK-GI-NEXT: mov v7.s[3], w12
3606 ; CHECK-GI-NEXT: mov v17.s[3], w13
3607 ; CHECK-GI-NEXT: mov v18.s[3], w8
3608 ; CHECK-GI-NEXT: mov v16.s[3], wzr
3609 ; CHECK-GI-NEXT: mov v20.s[3], wzr
3610 ; CHECK-GI-NEXT: mov v19.s[3], w9
3611 ; CHECK-GI-NEXT: mov v21.s[3], w10
3612 ; CHECK-GI-NEXT: mov v22.s[3], wzr
3613 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
3614 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
3615 ; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s
3616 ; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s
3617 ; CHECK-GI-NEXT: add v4.4s, v17.4s, v18.4s
3618 ; CHECK-GI-NEXT: add v6.4s, v16.4s, v20.4s
3619 ; CHECK-GI-NEXT: add v5.4s, v19.4s, v21.4s
3620 ; CHECK-GI-NEXT: add v7.4s, v22.4s, v20.4s
3621 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
3622 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v6.4s
3623 ; CHECK-GI-NEXT: add v2.4s, v3.4s, v4.4s
3624 ; CHECK-GI-NEXT: add v3.4s, v5.4s, v7.4s
3625 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
3626 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
3627 ; CHECK-GI-NEXT: addv s0, v0.4s
3628 ; CHECK-GI-NEXT: addv s1, v1.4s
3629 ; CHECK-GI-NEXT: fmov w8, s0
3630 ; CHECK-GI-NEXT: fmov w9, s1
3631 ; CHECK-GI-NEXT: add w0, w8, w9
3632 ; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
3633 ; CHECK-GI-NEXT: ret
; IR under test: two independent sign-extend + add reductions, then a scalar add.
3635 %az = sext <25 x i8> %a to <25 x i32>
3636 %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %az)
3637 %cz = sext <25 x i8> %c to <25 x i32>
3638 %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %cz)
3639 %x = add i32 %r1, %r2
; test_udot_v32i8: loads <32 x i8> from %a and from %b, zero-extends both to
; <32 x i32>, multiplies them, reduces with llvm.vector.reduce.add.v32i32 and
; adds %sum.
; Both expected sequences use two udot instructions. The SelectionDAG output
; chains them through a single zeroed accumulator (v0); the GlobalISel output
; uses two zeroed accumulators and a vector add before the final addv.
3643 define i32 @test_udot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
3644 ; CHECK-SD-LABEL: test_udot_v32i8:
3645 ; CHECK-SD: // %bb.0: // %entry
3646 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
3647 ; CHECK-SD-NEXT: ldp q1, q3, [x0]
3648 ; CHECK-SD-NEXT: ldp q2, q4, [x1]
3649 ; CHECK-SD-NEXT: udot v0.4s, v4.16b, v3.16b
3650 ; CHECK-SD-NEXT: udot v0.4s, v2.16b, v1.16b
3651 ; CHECK-SD-NEXT: addv s0, v0.4s
3652 ; CHECK-SD-NEXT: fmov w8, s0
3653 ; CHECK-SD-NEXT: add w0, w8, w2
3654 ; CHECK-SD-NEXT: ret
3656 ; CHECK-GI-LABEL: test_udot_v32i8:
3657 ; CHECK-GI: // %bb.0: // %entry
3658 ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
3659 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
3660 ; CHECK-GI-NEXT: ldp q2, q3, [x0]
3661 ; CHECK-GI-NEXT: ldp q4, q5, [x1]
3662 ; CHECK-GI-NEXT: udot v1.4s, v4.16b, v2.16b
3663 ; CHECK-GI-NEXT: udot v0.4s, v5.16b, v3.16b
3664 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
3665 ; CHECK-GI-NEXT: addv s0, v0.4s
3666 ; CHECK-GI-NEXT: fmov w8, s0
3667 ; CHECK-GI-NEXT: add w0, w8, w2
3668 ; CHECK-GI-NEXT: ret
; IR under test: unsigned i8 x i8 dot product over 32 elements, plus %sum.
3670 %0 = load <32 x i8>, ptr %a
3671 %1 = zext <32 x i8> %0 to <32 x i32>
3672 %2 = load <32 x i8>, ptr %b
3673 %3 = zext <32 x i8> %2 to <32 x i32>
3674 %4 = mul nuw nsw <32 x i32> %3, %1
3675 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
3676 %op.extra = add i32 %5, %sum
; test_udot_v32i8_nomla: loads <32 x i8> from %a1, zero-extends it to
; <32 x i32> and reduces with llvm.vector.reduce.add.v32i32; the IR contains
; no multiply.
; Both expected sequences still use udot, with a vector whose bytes are all 1
; (movi v0.16b, #1) as the second multiplicand. As in the multiply variant,
; SelectionDAG uses one accumulator and GlobalISel uses two plus a vector add.
3680 define i32 @test_udot_v32i8_nomla(ptr nocapture readonly %a1) {
3681 ; CHECK-SD-LABEL: test_udot_v32i8_nomla:
3682 ; CHECK-SD: // %bb.0: // %entry
3683 ; CHECK-SD-NEXT: movi v0.16b, #1
3684 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
3685 ; CHECK-SD-NEXT: ldp q2, q3, [x0]
3686 ; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b
3687 ; CHECK-SD-NEXT: udot v1.4s, v2.16b, v0.16b
3688 ; CHECK-SD-NEXT: addv s0, v1.4s
3689 ; CHECK-SD-NEXT: fmov w0, s0
3690 ; CHECK-SD-NEXT: ret
3692 ; CHECK-GI-LABEL: test_udot_v32i8_nomla:
3693 ; CHECK-GI: // %bb.0: // %entry
3694 ; CHECK-GI-NEXT: movi v0.16b, #1
3695 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
3696 ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
3697 ; CHECK-GI-NEXT: ldp q3, q4, [x0]
3698 ; CHECK-GI-NEXT: udot v2.4s, v3.16b, v0.16b
3699 ; CHECK-GI-NEXT: udot v1.4s, v4.16b, v0.16b
3700 ; CHECK-GI-NEXT: add v0.4s, v2.4s, v1.4s
3701 ; CHECK-GI-NEXT: addv s0, v0.4s
3702 ; CHECK-GI-NEXT: fmov w0, s0
3703 ; CHECK-GI-NEXT: ret
; IR under test: plain zero-extend + add reduction of 32 bytes.
3705 %0 = load <32 x i8>, ptr %a1
3706 %1 = zext <32 x i8> %0 to <32 x i32>
3707 %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
; test_sdot_v32i8: signed counterpart of the 32-element dot-product test.
; Loads <32 x i8> from %a and %b, sign-extends both to <32 x i32>, multiplies
; (mul nsw), reduces with llvm.vector.reduce.add.v32i32 and adds %sum.
; Both expected sequences use two sdot instructions: SelectionDAG through one
; accumulator (v0), GlobalISel through two accumulators joined by a vector add.
3710 define i32 @test_sdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
3711 ; CHECK-SD-LABEL: test_sdot_v32i8:
3712 ; CHECK-SD: // %bb.0: // %entry
3713 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
3714 ; CHECK-SD-NEXT: ldp q1, q3, [x0]
3715 ; CHECK-SD-NEXT: ldp q2, q4, [x1]
3716 ; CHECK-SD-NEXT: sdot v0.4s, v4.16b, v3.16b
3717 ; CHECK-SD-NEXT: sdot v0.4s, v2.16b, v1.16b
3718 ; CHECK-SD-NEXT: addv s0, v0.4s
3719 ; CHECK-SD-NEXT: fmov w8, s0
3720 ; CHECK-SD-NEXT: add w0, w8, w2
3721 ; CHECK-SD-NEXT: ret
3723 ; CHECK-GI-LABEL: test_sdot_v32i8:
3724 ; CHECK-GI: // %bb.0: // %entry
3725 ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
3726 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
3727 ; CHECK-GI-NEXT: ldp q2, q3, [x0]
3728 ; CHECK-GI-NEXT: ldp q4, q5, [x1]
3729 ; CHECK-GI-NEXT: sdot v1.4s, v4.16b, v2.16b
3730 ; CHECK-GI-NEXT: sdot v0.4s, v5.16b, v3.16b
3731 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
3732 ; CHECK-GI-NEXT: addv s0, v0.4s
3733 ; CHECK-GI-NEXT: fmov w8, s0
3734 ; CHECK-GI-NEXT: add w0, w8, w2
3735 ; CHECK-GI-NEXT: ret
; IR under test: signed i8 x i8 dot product over 32 elements, plus %sum.
3737 %0 = load <32 x i8>, ptr %a
3738 %1 = sext <32 x i8> %0 to <32 x i32>
3739 %2 = load <32 x i8>, ptr %b
3740 %3 = sext <32 x i8> %2 to <32 x i32>
3741 %4 = mul nsw <32 x i32> %3, %1
3742 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
3743 %op.extra = add nsw i32 %5, %sum
; test_sdot_v32i8_double: two signed dot products in one function. %a*%b and
; %c*%d (all <32 x i8>, sign-extended to <32 x i32>) are each reduced with
; llvm.vector.reduce.add.v32i32 and the two scalars are added into %x.
; The SelectionDAG output uses four sdot instructions into two accumulators,
; adds the accumulators as vectors and performs a single addv. The GlobalISel
; output uses four sdot instructions into four accumulators, performs two
; addv reductions and adds the results in scalar registers.
3747 define i32 @test_sdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
3748 ; CHECK-SD-LABEL: test_sdot_v32i8_double:
3749 ; CHECK-SD: // %bb.0: // %entry
3750 ; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
3751 ; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
3752 ; CHECK-SD-NEXT: sdot v17.4s, v1.16b, v3.16b
3753 ; CHECK-SD-NEXT: sdot v16.4s, v5.16b, v7.16b
3754 ; CHECK-SD-NEXT: sdot v17.4s, v0.16b, v2.16b
3755 ; CHECK-SD-NEXT: sdot v16.4s, v4.16b, v6.16b
3756 ; CHECK-SD-NEXT: add v0.4s, v17.4s, v16.4s
3757 ; CHECK-SD-NEXT: addv s0, v0.4s
3758 ; CHECK-SD-NEXT: fmov w0, s0
3759 ; CHECK-SD-NEXT: ret
3761 ; CHECK-GI-LABEL: test_sdot_v32i8_double:
3762 ; CHECK-GI: // %bb.0: // %entry
3763 ; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
3764 ; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
3765 ; CHECK-GI-NEXT: movi v18.2d, #0000000000000000
3766 ; CHECK-GI-NEXT: movi v19.2d, #0000000000000000
3767 ; CHECK-GI-NEXT: sdot v16.4s, v0.16b, v2.16b
3768 ; CHECK-GI-NEXT: sdot v18.4s, v1.16b, v3.16b
3769 ; CHECK-GI-NEXT: sdot v17.4s, v5.16b, v7.16b
3770 ; CHECK-GI-NEXT: sdot v19.4s, v4.16b, v6.16b
3771 ; CHECK-GI-NEXT: add v0.4s, v16.4s, v18.4s
3772 ; CHECK-GI-NEXT: add v1.4s, v19.4s, v17.4s
3773 ; CHECK-GI-NEXT: addv s0, v0.4s
3774 ; CHECK-GI-NEXT: addv s1, v1.4s
3775 ; CHECK-GI-NEXT: fmov w8, s0
3776 ; CHECK-GI-NEXT: fmov w9, s1
3777 ; CHECK-GI-NEXT: add w0, w8, w9
3778 ; CHECK-GI-NEXT: ret
; IR under test.
; NOTE(review): both multiplies carry nuw although their operands are
; sign-extended; per the LangRef a product that wraps as unsigned is poison, so
; some negative inputs are poison here. Presumably harmless for a codegen-only
; test - confirm before reusing this IR where values matter.
3780 %az = sext <32 x i8> %a to <32 x i32>
3781 %bz = sext <32 x i8> %b to <32 x i32>
3782 %m1 = mul nuw nsw <32 x i32> %az, %bz
3783 %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1)
3784 %cz = sext <32 x i8> %c to <32 x i32>
3785 %dz = sext <32 x i8> %d to <32 x i32>
3786 %m2 = mul nuw nsw <32 x i32> %cz, %dz
3787 %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2)
3788 %x = add i32 %r1, %r2
; test_sdot_v32i8_double_nomla: sign-extends the <32 x i8> arguments %a and %c
; to <32 x i32>, reduces each with llvm.vector.reduce.add.v32i32 and adds the
; two scalars into %x. There is no multiply, and %b and %d are not referenced
; by the IR shown below.
; Both expected sequences use sdot against a vector whose bytes are all 1
; (movi v2.16b, #1). SelectionDAG uses two accumulators and one addv;
; GlobalISel uses four accumulators, two addv reductions and a scalar add.
3792 define i32 @test_sdot_v32i8_double_nomla(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
3793 ; CHECK-SD-LABEL: test_sdot_v32i8_double_nomla:
3794 ; CHECK-SD: // %bb.0: // %entry
3795 ; CHECK-SD-NEXT: movi v2.16b, #1
3796 ; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
3797 ; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
3798 ; CHECK-SD-NEXT: sdot v6.4s, v1.16b, v2.16b
3799 ; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v2.16b
3800 ; CHECK-SD-NEXT: sdot v6.4s, v0.16b, v2.16b
3801 ; CHECK-SD-NEXT: sdot v3.4s, v4.16b, v2.16b
3802 ; CHECK-SD-NEXT: add v0.4s, v6.4s, v3.4s
3803 ; CHECK-SD-NEXT: addv s0, v0.4s
3804 ; CHECK-SD-NEXT: fmov w0, s0
3805 ; CHECK-SD-NEXT: ret
3807 ; CHECK-GI-LABEL: test_sdot_v32i8_double_nomla:
3808 ; CHECK-GI: // %bb.0: // %entry
3809 ; CHECK-GI-NEXT: movi v2.16b, #1
3810 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
3811 ; CHECK-GI-NEXT: movi v6.2d, #0000000000000000
3812 ; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
3813 ; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
3814 ; CHECK-GI-NEXT: sdot v3.4s, v0.16b, v2.16b
3815 ; CHECK-GI-NEXT: sdot v6.4s, v5.16b, v2.16b
3816 ; CHECK-GI-NEXT: sdot v7.4s, v1.16b, v2.16b
3817 ; CHECK-GI-NEXT: sdot v16.4s, v4.16b, v2.16b
3818 ; CHECK-GI-NEXT: add v0.4s, v3.4s, v7.4s
3819 ; CHECK-GI-NEXT: add v1.4s, v16.4s, v6.4s
3820 ; CHECK-GI-NEXT: addv s0, v0.4s
3821 ; CHECK-GI-NEXT: addv s1, v1.4s
3822 ; CHECK-GI-NEXT: fmov w8, s0
3823 ; CHECK-GI-NEXT: fmov w9, s1
3824 ; CHECK-GI-NEXT: add w0, w8, w9
3825 ; CHECK-GI-NEXT: ret
; IR under test: two independent sign-extend + add reductions, then a scalar add.
3827 %az = sext <32 x i8> %a to <32 x i32>
3828 %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %az)
3829 %cz = sext <32 x i8> %c to <32 x i32>
3830 %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %cz)
3831 %x = add i32 %r1, %r2
; test_usdot_v32i8: mixed-sign dot product. Loads <32 x i8> from %a
; (zero-extended) and from %b (sign-extended), multiplies the <32 x i32>
; values, reduces with llvm.vector.reduce.add.v32i32 and adds %sum.
; The SelectionDAG output uses two usdot instructions (presumably enabled by
; +i8mm on the RUN lines) into two accumulators. The GlobalISel output contains
; no usdot: it widens with sshll/ushll in two steps and multiplies with
; mul/mla on 4 x i32 vectors.
3835 define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
3836 ; CHECK-SD-LABEL: test_usdot_v32i8:
3837 ; CHECK-SD: // %bb.0: // %entry
3838 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
3839 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
3840 ; CHECK-SD-NEXT: ldp q2, q3, [x0]
3841 ; CHECK-SD-NEXT: ldp q4, q5, [x1]
3842 ; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
3843 ; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
3844 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
3845 ; CHECK-SD-NEXT: addv s0, v0.4s
3846 ; CHECK-SD-NEXT: fmov w8, s0
3847 ; CHECK-SD-NEXT: add w0, w8, w2
3848 ; CHECK-SD-NEXT: ret
3850 ; CHECK-GI-LABEL: test_usdot_v32i8:
3851 ; CHECK-GI: // %bb.0: // %entry
3852 ; CHECK-GI-NEXT: ldp q0, q1, [x1]
3853 ; CHECK-GI-NEXT: ldp q2, q3, [x0]
3854 ; CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0
3855 ; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0
3856 ; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0
3857 ; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0
3858 ; CHECK-GI-NEXT: ushll v6.8h, v2.8b, #0
3859 ; CHECK-GI-NEXT: ushll2 v2.8h, v2.16b, #0
3860 ; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0
3861 ; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0
3862 ; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0
3863 ; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0
3864 ; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0
3865 ; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0
3866 ; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0
3867 ; CHECK-GI-NEXT: ushll2 v21.4s, v2.8h, #0
3868 ; CHECK-GI-NEXT: ushll2 v22.4s, v7.8h, #0
3869 ; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0
3870 ; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0
3871 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
3872 ; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0
3873 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
3874 ; CHECK-GI-NEXT: mul v16.4s, v16.4s, v20.4s
3875 ; CHECK-GI-NEXT: mul v17.4s, v17.4s, v21.4s
3876 ; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0
3877 ; CHECK-GI-NEXT: mul v18.4s, v18.4s, v22.4s
3878 ; CHECK-GI-NEXT: mul v19.4s, v19.4s, v23.4s
3879 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
3880 ; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0
3881 ; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
3882 ; CHECK-GI-NEXT: mla v16.4s, v4.4s, v6.4s
3883 ; CHECK-GI-NEXT: mla v17.4s, v0.4s, v2.4s
3884 ; CHECK-GI-NEXT: mla v18.4s, v5.4s, v7.4s
3885 ; CHECK-GI-NEXT: mla v19.4s, v1.4s, v3.4s
3886 ; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s
3887 ; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s
3888 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
3889 ; CHECK-GI-NEXT: addv s0, v0.4s
3890 ; CHECK-GI-NEXT: fmov w8, s0
3891 ; CHECK-GI-NEXT: add w0, w8, w2
3892 ; CHECK-GI-NEXT: ret
; IR under test: %a is the unsigned operand (zext), %b the signed one (sext).
3894 %0 = load <32 x i8>, ptr %a
3895 %1 = zext <32 x i8> %0 to <32 x i32>
3896 %2 = load <32 x i8>, ptr %b
3897 %3 = sext <32 x i8> %2 to <32 x i32>
3898 %4 = mul nsw <32 x i32> %3, %1
3899 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
3900 %op.extra = add nsw i32 %5, %sum
; test_usdot_v32i8_double: two mixed-sign dot products in one function.
; %a and %c are zero-extended, %b and %d are sign-extended (all <32 x i8> to
; <32 x i32>); %a*%b and %c*%d are each reduced with
; llvm.vector.reduce.add.v32i32 and the two scalars are added into %x.
; The SelectionDAG output uses four usdot instructions into four accumulators
; followed by vector adds and one addv. The GlobalISel output contains no
; usdot: it widens every operand with ushll/sshll, multiplies with mul/mla,
; and spills/reloads d8-d15 because it uses v8-v15 as temporaries.
3904 define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
3905 ; CHECK-SD-LABEL: test_usdot_v32i8_double:
3906 ; CHECK-SD: // %bb.0: // %entry
3907 ; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
3908 ; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
3909 ; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
3910 ; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
3911 ; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v3.16b
3912 ; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v2.16b
3913 ; CHECK-SD-NEXT: usdot v17.4s, v4.16b, v6.16b
3914 ; CHECK-SD-NEXT: usdot v19.4s, v5.16b, v7.16b
3915 ; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
3916 ; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
3917 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
3918 ; CHECK-SD-NEXT: addv s0, v0.4s
3919 ; CHECK-SD-NEXT: fmov w0, s0
3920 ; CHECK-SD-NEXT: ret
3922 ; CHECK-GI-LABEL: test_usdot_v32i8_double:
3923 ; CHECK-GI: // %bb.0: // %entry
3924 ; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
3925 ; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
3926 ; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
3927 ; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
3928 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
3929 ; CHECK-GI-NEXT: .cfi_offset b8, -8
3930 ; CHECK-GI-NEXT: .cfi_offset b9, -16
3931 ; CHECK-GI-NEXT: .cfi_offset b10, -24
3932 ; CHECK-GI-NEXT: .cfi_offset b11, -32
3933 ; CHECK-GI-NEXT: .cfi_offset b12, -40
3934 ; CHECK-GI-NEXT: .cfi_offset b13, -48
3935 ; CHECK-GI-NEXT: .cfi_offset b14, -56
3936 ; CHECK-GI-NEXT: .cfi_offset b15, -64
3937 ; CHECK-GI-NEXT: ushll v16.8h, v0.8b, #0
3938 ; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
3939 ; CHECK-GI-NEXT: ushll v17.8h, v1.8b, #0
3940 ; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
3941 ; CHECK-GI-NEXT: sshll v18.8h, v2.8b, #0
3942 ; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0
3943 ; CHECK-GI-NEXT: sshll v19.8h, v3.8b, #0
3944 ; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0
3945 ; CHECK-GI-NEXT: ushll v27.8h, v4.8b, #0
3946 ; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0
3947 ; CHECK-GI-NEXT: ushll v28.8h, v5.8b, #0
3948 ; CHECK-GI-NEXT: sshll v29.8h, v6.8b, #0
3949 ; CHECK-GI-NEXT: sshll2 v6.8h, v6.16b, #0
3950 ; CHECK-GI-NEXT: ushll2 v5.8h, v5.16b, #0
3951 ; CHECK-GI-NEXT: sshll v30.8h, v7.8b, #0
3952 ; CHECK-GI-NEXT: sshll2 v7.8h, v7.16b, #0
3953 ; CHECK-GI-NEXT: ushll2 v20.4s, v16.8h, #0
3954 ; CHECK-GI-NEXT: ushll2 v21.4s, v0.8h, #0
3955 ; CHECK-GI-NEXT: ushll2 v22.4s, v17.8h, #0
3956 ; CHECK-GI-NEXT: ushll2 v23.4s, v1.8h, #0
3957 ; CHECK-GI-NEXT: sshll2 v24.4s, v18.8h, #0
3958 ; CHECK-GI-NEXT: sshll2 v25.4s, v2.8h, #0
3959 ; CHECK-GI-NEXT: sshll2 v26.4s, v19.8h, #0
3960 ; CHECK-GI-NEXT: sshll2 v31.4s, v3.8h, #0
3961 ; CHECK-GI-NEXT: ushll2 v8.4s, v27.8h, #0
3962 ; CHECK-GI-NEXT: ushll2 v9.4s, v4.8h, #0
3963 ; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0
3964 ; CHECK-GI-NEXT: sshll2 v11.4s, v29.8h, #0
3965 ; CHECK-GI-NEXT: sshll2 v12.4s, v6.8h, #0
3966 ; CHECK-GI-NEXT: ushll2 v13.4s, v5.8h, #0
3967 ; CHECK-GI-NEXT: sshll2 v14.4s, v30.8h, #0
3968 ; CHECK-GI-NEXT: sshll2 v15.4s, v7.8h, #0
3969 ; CHECK-GI-NEXT: mul v20.4s, v20.4s, v24.4s
3970 ; CHECK-GI-NEXT: mul v21.4s, v21.4s, v25.4s
3971 ; CHECK-GI-NEXT: mul v22.4s, v22.4s, v26.4s
3972 ; CHECK-GI-NEXT: mul v23.4s, v23.4s, v31.4s
3973 ; CHECK-GI-NEXT: mul v24.4s, v8.4s, v11.4s
3974 ; CHECK-GI-NEXT: mul v25.4s, v9.4s, v12.4s
3975 ; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0
3976 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
3977 ; CHECK-GI-NEXT: mul v26.4s, v10.4s, v14.4s
3978 ; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
3979 ; CHECK-GI-NEXT: mul v31.4s, v13.4s, v15.4s
3980 ; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
3981 ; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
3982 ; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0
3983 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
3984 ; CHECK-GI-NEXT: sshll v18.4s, v18.4h, #0
3985 ; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
3986 ; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0
3987 ; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
3988 ; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0
3989 ; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0
3990 ; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0
3991 ; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0
3992 ; CHECK-GI-NEXT: sshll v29.4s, v29.4h, #0
3993 ; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0
3994 ; CHECK-GI-NEXT: sshll v30.4s, v30.4h, #0
3995 ; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0
3996 ; CHECK-GI-NEXT: mla v20.4s, v16.4s, v18.4s
3997 ; CHECK-GI-NEXT: mla v21.4s, v0.4s, v2.4s
3998 ; CHECK-GI-NEXT: mla v22.4s, v17.4s, v19.4s
3999 ; CHECK-GI-NEXT: mla v23.4s, v1.4s, v3.4s
4000 ; CHECK-GI-NEXT: mla v24.4s, v27.4s, v29.4s
4001 ; CHECK-GI-NEXT: mla v25.4s, v4.4s, v6.4s
4002 ; CHECK-GI-NEXT: mla v26.4s, v28.4s, v30.4s
4003 ; CHECK-GI-NEXT: mla v31.4s, v5.4s, v7.4s
4004 ; CHECK-GI-NEXT: add v0.4s, v20.4s, v21.4s
4005 ; CHECK-GI-NEXT: add v1.4s, v22.4s, v23.4s
4006 ; CHECK-GI-NEXT: add v2.4s, v24.4s, v25.4s
4007 ; CHECK-GI-NEXT: add v3.4s, v26.4s, v31.4s
4008 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
4009 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
4010 ; CHECK-GI-NEXT: addv s0, v0.4s
4011 ; CHECK-GI-NEXT: addv s1, v1.4s
4012 ; CHECK-GI-NEXT: fmov w8, s0
4013 ; CHECK-GI-NEXT: fmov w9, s1
4014 ; CHECK-GI-NEXT: add w0, w8, w9
4015 ; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
4016 ; CHECK-GI-NEXT: ret
; IR under test.
; NOTE(review): both multiplies carry nuw although one operand of each is
; sign-extended; per the LangRef a product that wraps as unsigned is poison, so
; some negative %b/%d inputs are poison here. Presumably harmless for a
; codegen-only test - confirm before reusing this IR where values matter.
4018 %az = zext <32 x i8> %a to <32 x i32>
4019 %bz = sext <32 x i8> %b to <32 x i32>
4020 %m1 = mul nuw nsw <32 x i32> %az, %bz
4021 %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1)
4022 %cz = zext <32 x i8> %c to <32 x i32>
4023 %dz = sext <32 x i8> %d to <32 x i32>
4024 %m2 = mul nuw nsw <32 x i32> %cz, %dz
4025 %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2)
4026 %x = add i32 %r1, %r2
4031 define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
4032 ; CHECK-SD-LABEL: test_udot_v33i8:
4033 ; CHECK-SD: // %bb.0: // %entry
4034 ; CHECK-SD-NEXT: ldr b0, [x0, #32]
4035 ; CHECK-SD-NEXT: ldr b1, [x1, #32]
4036 ; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
4037 ; CHECK-SD-NEXT: ldp q3, q4, [x1]
4038 ; CHECK-SD-NEXT: umull v0.8h, v1.8b, v0.8b
4039 ; CHECK-SD-NEXT: ldp q1, q2, [x0]
4040 ; CHECK-SD-NEXT: umull v5.8h, v4.8b, v2.8b
4041 ; CHECK-SD-NEXT: umull v6.8h, v3.8b, v1.8b
4042 ; CHECK-SD-NEXT: umull2 v2.8h, v4.16b, v2.16b
4043 ; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
4044 ; CHECK-SD-NEXT: umull2 v1.8h, v3.16b, v1.16b
4045 ; CHECK-SD-NEXT: mov v7.s[0], v0.s[0]
4046 ; CHECK-SD-NEXT: uaddl2 v3.4s, v6.8h, v5.8h
4047 ; CHECK-SD-NEXT: uaddl2 v0.4s, v1.8h, v2.8h
4048 ; CHECK-SD-NEXT: uaddl v1.4s, v1.4h, v2.4h
4049 ; CHECK-SD-NEXT: add v0.4s, v3.4s, v0.4s
4050 ; CHECK-SD-NEXT: uaddw v2.4s, v7.4s, v6.4h
4051 ; CHECK-SD-NEXT: uaddw v2.4s, v2.4s, v5.4h
4052 ; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
4053 ; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s
4054 ; CHECK-SD-NEXT: addv s0, v0.4s
4055 ; CHECK-SD-NEXT: fmov w8, s0
4056 ; CHECK-SD-NEXT: add w0, w8, w2
4057 ; CHECK-SD-NEXT: ret
4059 ; CHECK-GI-LABEL: test_udot_v33i8:
4060 ; CHECK-GI: // %bb.0: // %entry
4061 ; CHECK-GI-NEXT: ldp q19, q4, [x1]
4062 ; CHECK-GI-NEXT: mov v25.s[0], wzr
4063 ; CHECK-GI-NEXT: ldp q7, q5, [x0]
4064 ; CHECK-GI-NEXT: umov w8, v19.b[0]
4065 ; CHECK-GI-NEXT: umov w9, v19.b[4]
4066 ; CHECK-GI-NEXT: umov w10, v19.b[8]
4067 ; CHECK-GI-NEXT: umov w11, v19.b[1]
4068 ; CHECK-GI-NEXT: umov w12, v19.b[6]
4069 ; CHECK-GI-NEXT: umov w13, v19.b[12]
4070 ; CHECK-GI-NEXT: umov w14, v4.b[0]
4071 ; CHECK-GI-NEXT: umov w15, v4.b[4]
4072 ; CHECK-GI-NEXT: umov w16, v4.b[12]
4073 ; CHECK-GI-NEXT: mov v25.s[1], wzr
4074 ; CHECK-GI-NEXT: mov v0.s[0], w8
4075 ; CHECK-GI-NEXT: umov w8, v19.b[5]
4076 ; CHECK-GI-NEXT: mov v2.s[0], w9
4077 ; CHECK-GI-NEXT: umov w9, v19.b[9]
4078 ; CHECK-GI-NEXT: mov v1.s[0], w10
4079 ; CHECK-GI-NEXT: umov w10, v19.b[2]
4080 ; CHECK-GI-NEXT: mov v6.s[0], w13
4081 ; CHECK-GI-NEXT: umov w13, v19.b[3]
4082 ; CHECK-GI-NEXT: mov v3.s[0], w14
4083 ; CHECK-GI-NEXT: umov w14, v19.b[13]
4084 ; CHECK-GI-NEXT: mov v16.s[0], w15
4085 ; CHECK-GI-NEXT: umov w15, v4.b[8]
4086 ; CHECK-GI-NEXT: mov v0.s[1], w11
4087 ; CHECK-GI-NEXT: umov w11, v19.b[10]
4088 ; CHECK-GI-NEXT: mov v2.s[1], w8
4089 ; CHECK-GI-NEXT: ldrb w8, [x0, #32]
4090 ; CHECK-GI-NEXT: mov v1.s[1], w9
4091 ; CHECK-GI-NEXT: ldrb w9, [x1, #32]
4092 ; CHECK-GI-NEXT: mov v17.s[0], w16
4093 ; CHECK-GI-NEXT: umov w16, v19.b[14]
4094 ; CHECK-GI-NEXT: mov v25.s[2], wzr
4095 ; CHECK-GI-NEXT: mul w8, w9, w8
4096 ; CHECK-GI-NEXT: mov v6.s[1], w14
4097 ; CHECK-GI-NEXT: umov w14, v4.b[1]
4098 ; CHECK-GI-NEXT: mov v0.s[2], w10
4099 ; CHECK-GI-NEXT: umov w10, v19.b[7]
4100 ; CHECK-GI-NEXT: mov v2.s[2], w12
4101 ; CHECK-GI-NEXT: umov w12, v19.b[11]
4102 ; CHECK-GI-NEXT: mov v1.s[2], w11
4103 ; CHECK-GI-NEXT: umov w11, v4.b[5]
4104 ; CHECK-GI-NEXT: mov v18.s[0], w15
4105 ; CHECK-GI-NEXT: umov w15, v19.b[15]
4106 ; CHECK-GI-NEXT: umov w9, v5.b[2]
4107 ; CHECK-GI-NEXT: mov v6.s[2], w16
4108 ; CHECK-GI-NEXT: umov w16, v7.b[0]
4109 ; CHECK-GI-NEXT: mov v3.s[1], w14
4110 ; CHECK-GI-NEXT: mov v0.s[3], w13
4111 ; CHECK-GI-NEXT: umov w13, v7.b[4]
4112 ; CHECK-GI-NEXT: mov v2.s[3], w10
4113 ; CHECK-GI-NEXT: umov w10, v4.b[6]
4114 ; CHECK-GI-NEXT: mov v1.s[3], w12
4115 ; CHECK-GI-NEXT: umov w12, v4.b[13]
4116 ; CHECK-GI-NEXT: mov v16.s[1], w11
4117 ; CHECK-GI-NEXT: umov w11, v4.b[9]
4118 ; CHECK-GI-NEXT: umov w14, v7.b[5]
4119 ; CHECK-GI-NEXT: mov v19.s[0], w16
4120 ; CHECK-GI-NEXT: umov w16, v7.b[1]
4121 ; CHECK-GI-NEXT: mov v6.s[3], w15
4122 ; CHECK-GI-NEXT: mov v20.s[0], w13
4123 ; CHECK-GI-NEXT: umov w13, v4.b[2]
4124 ; CHECK-GI-NEXT: umov w15, v7.b[6]
4125 ; CHECK-GI-NEXT: mov v17.s[1], w12
4126 ; CHECK-GI-NEXT: umov w12, v4.b[14]
4127 ; CHECK-GI-NEXT: mov v27.s[0], w8
4128 ; CHECK-GI-NEXT: mov v16.s[2], w10
4129 ; CHECK-GI-NEXT: umov w10, v4.b[7]
4130 ; CHECK-GI-NEXT: mov v18.s[1], w11
4131 ; CHECK-GI-NEXT: umov w11, v4.b[10]
4132 ; CHECK-GI-NEXT: mov v19.s[1], w16
4133 ; CHECK-GI-NEXT: umov w16, v5.b[4]
4134 ; CHECK-GI-NEXT: mov v20.s[1], w14
4135 ; CHECK-GI-NEXT: umov w14, v4.b[15]
4136 ; CHECK-GI-NEXT: mov v3.s[2], w13
4137 ; CHECK-GI-NEXT: mov v17.s[2], w12
4138 ; CHECK-GI-NEXT: umov w12, v7.b[12]
4139 ; CHECK-GI-NEXT: umov w13, v7.b[7]
4140 ; CHECK-GI-NEXT: mov v16.s[3], w10
4141 ; CHECK-GI-NEXT: umov w10, v7.b[8]
4142 ; CHECK-GI-NEXT: umov w8, v7.b[3]
4143 ; CHECK-GI-NEXT: mov v18.s[2], w11
4144 ; CHECK-GI-NEXT: umov w11, v7.b[2]
4145 ; CHECK-GI-NEXT: mov v23.s[0], w16
4146 ; CHECK-GI-NEXT: mov v20.s[2], w15
4147 ; CHECK-GI-NEXT: umov w15, v5.b[12]
4148 ; CHECK-GI-NEXT: umov w16, v7.b[14]
4149 ; CHECK-GI-NEXT: mov v17.s[3], w14
4150 ; CHECK-GI-NEXT: umov w14, v7.b[13]
4151 ; CHECK-GI-NEXT: mov v22.s[0], w12
4152 ; CHECK-GI-NEXT: umov w12, v7.b[9]
4153 ; CHECK-GI-NEXT: mov v21.s[0], w10
4154 ; CHECK-GI-NEXT: umov w10, v4.b[3]
4155 ; CHECK-GI-NEXT: mov v19.s[2], w11
4156 ; CHECK-GI-NEXT: umov w11, v5.b[0]
4157 ; CHECK-GI-NEXT: mov v27.s[1], wzr
4158 ; CHECK-GI-NEXT: mov v20.s[3], w13
4159 ; CHECK-GI-NEXT: umov w13, v5.b[5]
4160 ; CHECK-GI-NEXT: mov v24.s[0], w15
4161 ; CHECK-GI-NEXT: mov v22.s[1], w14
4162 ; CHECK-GI-NEXT: umov w14, v5.b[8]
4163 ; CHECK-GI-NEXT: umov w15, v4.b[11]
4164 ; CHECK-GI-NEXT: mov v21.s[1], w12
4165 ; CHECK-GI-NEXT: umov w12, v5.b[13]
4166 ; CHECK-GI-NEXT: mov v25.s[3], wzr
4167 ; CHECK-GI-NEXT: mov v4.s[0], w11
4168 ; CHECK-GI-NEXT: umov w11, v5.b[1]
4169 ; CHECK-GI-NEXT: mov v3.s[3], w10
4170 ; CHECK-GI-NEXT: mov v23.s[1], w13
4171 ; CHECK-GI-NEXT: umov w13, v5.b[6]
4172 ; CHECK-GI-NEXT: mov v19.s[3], w8
4173 ; CHECK-GI-NEXT: mov v22.s[2], w16
4174 ; CHECK-GI-NEXT: umov w16, v5.b[9]
4175 ; CHECK-GI-NEXT: mov v26.s[0], w14
4176 ; CHECK-GI-NEXT: mov v24.s[1], w12
4177 ; CHECK-GI-NEXT: umov w12, v5.b[14]
4178 ; CHECK-GI-NEXT: umov w14, v7.b[10]
4179 ; CHECK-GI-NEXT: mov v4.s[1], w11
4180 ; CHECK-GI-NEXT: umov w11, v7.b[15]
4181 ; CHECK-GI-NEXT: mov v18.s[3], w15
4182 ; CHECK-GI-NEXT: mov v23.s[2], w13
4183 ; CHECK-GI-NEXT: umov w13, v5.b[7]
4184 ; CHECK-GI-NEXT: mul v2.4s, v2.4s, v20.4s
4185 ; CHECK-GI-NEXT: mov v26.s[1], w16
4186 ; CHECK-GI-NEXT: umov w16, v5.b[10]
4187 ; CHECK-GI-NEXT: mov v27.s[2], wzr
4188 ; CHECK-GI-NEXT: mov v24.s[2], w12
4189 ; CHECK-GI-NEXT: umov w12, v5.b[15]
4190 ; CHECK-GI-NEXT: mov v21.s[2], w14
4191 ; CHECK-GI-NEXT: umov w14, v7.b[11]
4192 ; CHECK-GI-NEXT: mov v4.s[2], w9
4193 ; CHECK-GI-NEXT: umov w9, v5.b[3]
4194 ; CHECK-GI-NEXT: mov v22.s[3], w11
4195 ; CHECK-GI-NEXT: umov w11, v5.b[11]
4196 ; CHECK-GI-NEXT: mov v23.s[3], w13
4197 ; CHECK-GI-NEXT: mov v26.s[2], w16
4198 ; CHECK-GI-NEXT: mla v2.4s, v0.4s, v19.4s
4199 ; CHECK-GI-NEXT: mov v27.s[3], wzr
4200 ; CHECK-GI-NEXT: mov v24.s[3], w12
4201 ; CHECK-GI-NEXT: mov v21.s[3], w14
4202 ; CHECK-GI-NEXT: mov v4.s[3], w9
4203 ; CHECK-GI-NEXT: mul v5.4s, v6.4s, v22.4s
4204 ; CHECK-GI-NEXT: mul v6.4s, v16.4s, v23.4s
4205 ; CHECK-GI-NEXT: add v16.4s, v25.4s, v25.4s
4206 ; CHECK-GI-NEXT: mov v26.s[3], w11
4207 ; CHECK-GI-NEXT: mul v7.4s, v17.4s, v24.4s
4208 ; CHECK-GI-NEXT: add v0.4s, v25.4s, v16.4s
4209 ; CHECK-GI-NEXT: mla v5.4s, v1.4s, v21.4s
4210 ; CHECK-GI-NEXT: mla v6.4s, v3.4s, v4.4s
4211 ; CHECK-GI-NEXT: add v3.4s, v16.4s, v16.4s
4212 ; CHECK-GI-NEXT: mla v7.4s, v18.4s, v26.4s
4213 ; CHECK-GI-NEXT: add v0.4s, v27.4s, v0.4s
4214 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v5.4s
4215 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v3.4s
4216 ; CHECK-GI-NEXT: add v2.4s, v6.4s, v7.4s
4217 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
4218 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
4219 ; CHECK-GI-NEXT: addv s0, v0.4s
4220 ; CHECK-GI-NEXT: fmov w8, s0
4221 ; CHECK-GI-NEXT: add w0, w8, w2
4222 ; CHECK-GI-NEXT: ret
4224 %0 = load <33 x i8>, ptr %a
4225 %1 = zext <33 x i8> %0 to <33 x i32>
4226 %2 = load <33 x i8>, ptr %b
4227 %3 = zext <33 x i8> %2 to <33 x i32>
4228 %4 = mul nuw nsw <33 x i32> %3, %1
4229 %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
4230 %op.extra = add i32 %5, %sum
; Add-reduction of a zero-extended <33 x i8> load with no multiply operand.
; The expected output below contains no udot instruction for either selector:
;  - the SelectionDAG assertions widen with ushll/uaddl/uaddw before the
;    final addv;
;  - the GlobalISel assertions extract each byte with umov, rebuild .4s
;    vectors lane by lane (spilling x19-x27 to do so), and place the 33rd
;    byte in lane 0 of a vector whose remaining lanes are zero.
; The assertions are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
4234 define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) {
4235 ; CHECK-SD-LABEL: test_udot_v33i8_nomla:
4236 ; CHECK-SD: // %bb.0: // %entry
4237 ; CHECK-SD-NEXT: ldr b1, [x0, #32]
4238 ; CHECK-SD-NEXT: ldp q3, q2, [x0]
4239 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
4240 ; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
4241 ; CHECK-SD-NEXT: ushll v4.8h, v2.8b, #0
4242 ; CHECK-SD-NEXT: ushll v5.8h, v3.8b, #0
4243 ; CHECK-SD-NEXT: ushll2 v2.8h, v2.16b, #0
4244 ; CHECK-SD-NEXT: ushll2 v3.8h, v3.16b, #0
4245 ; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
4246 ; CHECK-SD-NEXT: uaddl2 v6.4s, v5.8h, v4.8h
4247 ; CHECK-SD-NEXT: mov v0.s[0], v1.s[0]
4248 ; CHECK-SD-NEXT: uaddl2 v1.4s, v3.8h, v2.8h
4249 ; CHECK-SD-NEXT: uaddl v2.4s, v3.4h, v2.4h
4250 ; CHECK-SD-NEXT: add v1.4s, v6.4s, v1.4s
4251 ; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v5.4h
4252 ; CHECK-SD-NEXT: add v1.4s, v2.4s, v1.4s
4253 ; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v4.4h
4254 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
4255 ; CHECK-SD-NEXT: addv s0, v0.4s
4256 ; CHECK-SD-NEXT: fmov w0, s0
4257 ; CHECK-SD-NEXT: ret
4259 ; CHECK-GI-LABEL: test_udot_v33i8_nomla:
4260 ; CHECK-GI: // %bb.0: // %entry
4261 ; CHECK-GI-NEXT: str x27, [sp, #-80]! // 8-byte Folded Spill
4262 ; CHECK-GI-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill
4263 ; CHECK-GI-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill
4264 ; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
4265 ; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
4266 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
4267 ; CHECK-GI-NEXT: .cfi_offset w19, -8
4268 ; CHECK-GI-NEXT: .cfi_offset w20, -16
4269 ; CHECK-GI-NEXT: .cfi_offset w21, -24
4270 ; CHECK-GI-NEXT: .cfi_offset w22, -32
4271 ; CHECK-GI-NEXT: .cfi_offset w23, -40
4272 ; CHECK-GI-NEXT: .cfi_offset w24, -48
4273 ; CHECK-GI-NEXT: .cfi_offset w25, -56
4274 ; CHECK-GI-NEXT: .cfi_offset w26, -64
4275 ; CHECK-GI-NEXT: .cfi_offset w27, -80
4276 ; CHECK-GI-NEXT: ldp q2, q1, [x0]
4277 ; CHECK-GI-NEXT: mov v0.s[0], wzr
4278 ; CHECK-GI-NEXT: ldrb w2, [x0, #32]
4279 ; CHECK-GI-NEXT: umov w20, v2.b[0]
4280 ; CHECK-GI-NEXT: umov w21, v2.b[4]
4281 ; CHECK-GI-NEXT: umov w22, v2.b[8]
4282 ; CHECK-GI-NEXT: umov w23, v2.b[12]
4283 ; CHECK-GI-NEXT: umov w24, v1.b[0]
4284 ; CHECK-GI-NEXT: umov w25, v1.b[4]
4285 ; CHECK-GI-NEXT: umov w26, v1.b[8]
4286 ; CHECK-GI-NEXT: umov w27, v1.b[12]
4287 ; CHECK-GI-NEXT: umov w0, v2.b[1]
4288 ; CHECK-GI-NEXT: umov w12, v2.b[2]
4289 ; CHECK-GI-NEXT: umov w8, v2.b[3]
4290 ; CHECK-GI-NEXT: umov w3, v2.b[5]
4291 ; CHECK-GI-NEXT: umov w14, v2.b[6]
4292 ; CHECK-GI-NEXT: umov w9, v2.b[7]
4293 ; CHECK-GI-NEXT: umov w4, v2.b[9]
4294 ; CHECK-GI-NEXT: umov w15, v2.b[10]
4295 ; CHECK-GI-NEXT: umov w10, v2.b[11]
4296 ; CHECK-GI-NEXT: umov w5, v2.b[13]
4297 ; CHECK-GI-NEXT: umov w16, v2.b[14]
4298 ; CHECK-GI-NEXT: umov w11, v2.b[15]
4299 ; CHECK-GI-NEXT: umov w6, v1.b[1]
4300 ; CHECK-GI-NEXT: umov w7, v1.b[5]
4301 ; CHECK-GI-NEXT: umov w19, v1.b[9]
4302 ; CHECK-GI-NEXT: mov v2.s[0], w20
4303 ; CHECK-GI-NEXT: mov v3.s[0], w21
4304 ; CHECK-GI-NEXT: mov v4.s[0], w22
4305 ; CHECK-GI-NEXT: mov v5.s[0], w23
4306 ; CHECK-GI-NEXT: mov v6.s[0], w24
4307 ; CHECK-GI-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload
4308 ; CHECK-GI-NEXT: mov v7.s[0], w25
4309 ; CHECK-GI-NEXT: mov v16.s[0], w26
4310 ; CHECK-GI-NEXT: umov w20, v1.b[13]
4311 ; CHECK-GI-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload
4312 ; CHECK-GI-NEXT: mov v17.s[0], w27
4313 ; CHECK-GI-NEXT: mov v18.s[0], w2
4314 ; CHECK-GI-NEXT: umov w17, v1.b[2]
4315 ; CHECK-GI-NEXT: umov w1, v1.b[6]
4316 ; CHECK-GI-NEXT: umov w2, v1.b[10]
4317 ; CHECK-GI-NEXT: umov w21, v1.b[14]
4318 ; CHECK-GI-NEXT: mov v2.s[1], w0
4319 ; CHECK-GI-NEXT: mov v3.s[1], w3
4320 ; CHECK-GI-NEXT: mov v4.s[1], w4
4321 ; CHECK-GI-NEXT: mov v5.s[1], w5
4322 ; CHECK-GI-NEXT: mov v6.s[1], w6
4323 ; CHECK-GI-NEXT: mov v7.s[1], w7
4324 ; CHECK-GI-NEXT: mov v16.s[1], w19
4325 ; CHECK-GI-NEXT: mov v17.s[1], w20
4326 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
4327 ; CHECK-GI-NEXT: mov v18.s[1], wzr
4328 ; CHECK-GI-NEXT: mov v0.s[1], wzr
4329 ; CHECK-GI-NEXT: umov w13, v1.b[3]
4330 ; CHECK-GI-NEXT: umov w18, v1.b[7]
4331 ; CHECK-GI-NEXT: umov w0, v1.b[11]
4332 ; CHECK-GI-NEXT: umov w3, v1.b[15]
4333 ; CHECK-GI-NEXT: mov v2.s[2], w12
4334 ; CHECK-GI-NEXT: mov v3.s[2], w14
4335 ; CHECK-GI-NEXT: mov v4.s[2], w15
4336 ; CHECK-GI-NEXT: mov v5.s[2], w16
4337 ; CHECK-GI-NEXT: mov v6.s[2], w17
4338 ; CHECK-GI-NEXT: mov v7.s[2], w1
4339 ; CHECK-GI-NEXT: mov v16.s[2], w2
4340 ; CHECK-GI-NEXT: mov v17.s[2], w21
4341 ; CHECK-GI-NEXT: mov v18.s[2], wzr
4342 ; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
4343 ; CHECK-GI-NEXT: mov v0.s[2], wzr
4344 ; CHECK-GI-NEXT: mov v2.s[3], w8
4345 ; CHECK-GI-NEXT: mov v3.s[3], w9
4346 ; CHECK-GI-NEXT: mov v4.s[3], w10
4347 ; CHECK-GI-NEXT: mov v5.s[3], w11
4348 ; CHECK-GI-NEXT: mov v6.s[3], w13
4349 ; CHECK-GI-NEXT: mov v7.s[3], w18
4350 ; CHECK-GI-NEXT: mov v16.s[3], w0
4351 ; CHECK-GI-NEXT: mov v17.s[3], w3
4352 ; CHECK-GI-NEXT: mov v18.s[3], wzr
4353 ; CHECK-GI-NEXT: mov v0.s[3], wzr
4354 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
4355 ; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s
4356 ; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s
4357 ; CHECK-GI-NEXT: add v4.4s, v16.4s, v17.4s
4358 ; CHECK-GI-NEXT: add v5.4s, v18.4s, v0.4s
4359 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v0.4s
4360 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
4361 ; CHECK-GI-NEXT: add v2.4s, v3.4s, v4.4s
4362 ; CHECK-GI-NEXT: add v3.4s, v5.4s, v0.4s
4363 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v0.4s
4364 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
4365 ; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
4366 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
4367 ; CHECK-GI-NEXT: addv s0, v0.4s
4368 ; CHECK-GI-NEXT: fmov w0, s0
4369 ; CHECK-GI-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload
4370 ; CHECK-GI-NEXT: ret
; IR under test: load 33 bytes from %a1, zero-extend each to i32, and sum
; all 33 lanes with the vector add-reduction intrinsic.
4372 %0 = load <33 x i8>, ptr %a1
4373 %1 = zext <33 x i8> %0 to <33 x i32>
4374 %2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %1)
; Signed dot-product pattern on <33 x i8>: both operands are loaded from
; memory, sign-extended to i32, multiplied, add-reduced, and the result is
; added to %sum.
; The expected output below contains no sdot instruction for either selector:
;  - the SelectionDAG assertions multiply with smull/smull2 and accumulate
;    with saddl/saddw widening adds before the final addv;
;  - the GlobalISel assertions extract each byte with smov, multiply .4s
;    vectors with mul/mla, and handle the 33rd element with scalar
;    ldrsb + mul.
; The assertions are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
4377 define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
4378 ; CHECK-SD-LABEL: test_sdot_v33i8:
4379 ; CHECK-SD: // %bb.0: // %entry
4380 ; CHECK-SD-NEXT: ldr b0, [x0, #32]
4381 ; CHECK-SD-NEXT: ldr b1, [x1, #32]
4382 ; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
4383 ; CHECK-SD-NEXT: ldp q3, q4, [x1]
4384 ; CHECK-SD-NEXT: smull v0.8h, v1.8b, v0.8b
4385 ; CHECK-SD-NEXT: ldp q1, q2, [x0]
4386 ; CHECK-SD-NEXT: smull v5.8h, v4.8b, v2.8b
4387 ; CHECK-SD-NEXT: smull v6.8h, v3.8b, v1.8b
4388 ; CHECK-SD-NEXT: smull2 v2.8h, v4.16b, v2.16b
4389 ; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
4390 ; CHECK-SD-NEXT: smull2 v1.8h, v3.16b, v1.16b
4391 ; CHECK-SD-NEXT: mov v7.s[0], v0.s[0]
4392 ; CHECK-SD-NEXT: saddl2 v3.4s, v6.8h, v5.8h
4393 ; CHECK-SD-NEXT: saddl2 v0.4s, v1.8h, v2.8h
4394 ; CHECK-SD-NEXT: saddl v1.4s, v1.4h, v2.4h
4395 ; CHECK-SD-NEXT: add v0.4s, v3.4s, v0.4s
4396 ; CHECK-SD-NEXT: saddw v2.4s, v7.4s, v6.4h
4397 ; CHECK-SD-NEXT: saddw v2.4s, v2.4s, v5.4h
4398 ; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
4399 ; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s
4400 ; CHECK-SD-NEXT: addv s0, v0.4s
4401 ; CHECK-SD-NEXT: fmov w8, s0
4402 ; CHECK-SD-NEXT: add w0, w8, w2
4403 ; CHECK-SD-NEXT: ret
4405 ; CHECK-GI-LABEL: test_sdot_v33i8:
4406 ; CHECK-GI: // %bb.0: // %entry
4407 ; CHECK-GI-NEXT: ldp q19, q4, [x1]
4408 ; CHECK-GI-NEXT: mov v25.s[0], wzr
4409 ; CHECK-GI-NEXT: ldp q7, q5, [x0]
4410 ; CHECK-GI-NEXT: smov w8, v19.b[0]
4411 ; CHECK-GI-NEXT: smov w9, v19.b[4]
4412 ; CHECK-GI-NEXT: smov w10, v19.b[8]
4413 ; CHECK-GI-NEXT: smov w11, v19.b[1]
4414 ; CHECK-GI-NEXT: smov w12, v19.b[6]
4415 ; CHECK-GI-NEXT: smov w13, v19.b[12]
4416 ; CHECK-GI-NEXT: smov w14, v4.b[0]
4417 ; CHECK-GI-NEXT: smov w15, v4.b[4]
4418 ; CHECK-GI-NEXT: smov w16, v4.b[12]
4419 ; CHECK-GI-NEXT: mov v25.s[1], wzr
4420 ; CHECK-GI-NEXT: mov v0.s[0], w8
4421 ; CHECK-GI-NEXT: smov w8, v19.b[5]
4422 ; CHECK-GI-NEXT: mov v2.s[0], w9
4423 ; CHECK-GI-NEXT: smov w9, v19.b[9]
4424 ; CHECK-GI-NEXT: mov v1.s[0], w10
4425 ; CHECK-GI-NEXT: smov w10, v19.b[2]
4426 ; CHECK-GI-NEXT: mov v6.s[0], w13
4427 ; CHECK-GI-NEXT: smov w13, v19.b[3]
4428 ; CHECK-GI-NEXT: mov v3.s[0], w14
4429 ; CHECK-GI-NEXT: smov w14, v19.b[13]
4430 ; CHECK-GI-NEXT: mov v16.s[0], w15
4431 ; CHECK-GI-NEXT: smov w15, v4.b[8]
4432 ; CHECK-GI-NEXT: mov v0.s[1], w11
4433 ; CHECK-GI-NEXT: smov w11, v19.b[10]
4434 ; CHECK-GI-NEXT: mov v2.s[1], w8
4435 ; CHECK-GI-NEXT: ldrsb w8, [x0, #32]
4436 ; CHECK-GI-NEXT: mov v1.s[1], w9
4437 ; CHECK-GI-NEXT: ldrsb w9, [x1, #32]
4438 ; CHECK-GI-NEXT: mov v17.s[0], w16
4439 ; CHECK-GI-NEXT: smov w16, v19.b[14]
4440 ; CHECK-GI-NEXT: mov v25.s[2], wzr
4441 ; CHECK-GI-NEXT: mul w8, w9, w8
4442 ; CHECK-GI-NEXT: mov v6.s[1], w14
4443 ; CHECK-GI-NEXT: smov w14, v4.b[1]
4444 ; CHECK-GI-NEXT: mov v0.s[2], w10
4445 ; CHECK-GI-NEXT: smov w10, v19.b[7]
4446 ; CHECK-GI-NEXT: mov v2.s[2], w12
4447 ; CHECK-GI-NEXT: smov w12, v19.b[11]
4448 ; CHECK-GI-NEXT: mov v1.s[2], w11
4449 ; CHECK-GI-NEXT: smov w11, v4.b[5]
4450 ; CHECK-GI-NEXT: mov v18.s[0], w15
4451 ; CHECK-GI-NEXT: smov w15, v19.b[15]
4452 ; CHECK-GI-NEXT: smov w9, v5.b[2]
4453 ; CHECK-GI-NEXT: mov v6.s[2], w16
4454 ; CHECK-GI-NEXT: smov w16, v7.b[0]
4455 ; CHECK-GI-NEXT: mov v3.s[1], w14
4456 ; CHECK-GI-NEXT: mov v0.s[3], w13
4457 ; CHECK-GI-NEXT: smov w13, v7.b[4]
4458 ; CHECK-GI-NEXT: mov v2.s[3], w10
4459 ; CHECK-GI-NEXT: smov w10, v4.b[6]
4460 ; CHECK-GI-NEXT: mov v1.s[3], w12
4461 ; CHECK-GI-NEXT: smov w12, v4.b[13]
4462 ; CHECK-GI-NEXT: mov v16.s[1], w11
4463 ; CHECK-GI-NEXT: smov w11, v4.b[9]
4464 ; CHECK-GI-NEXT: smov w14, v7.b[5]
4465 ; CHECK-GI-NEXT: mov v19.s[0], w16
4466 ; CHECK-GI-NEXT: smov w16, v7.b[1]
4467 ; CHECK-GI-NEXT: mov v6.s[3], w15
4468 ; CHECK-GI-NEXT: mov v20.s[0], w13
4469 ; CHECK-GI-NEXT: smov w13, v4.b[2]
4470 ; CHECK-GI-NEXT: smov w15, v7.b[6]
4471 ; CHECK-GI-NEXT: mov v17.s[1], w12
4472 ; CHECK-GI-NEXT: smov w12, v4.b[14]
4473 ; CHECK-GI-NEXT: mov v27.s[0], w8
4474 ; CHECK-GI-NEXT: mov v16.s[2], w10
4475 ; CHECK-GI-NEXT: smov w10, v4.b[7]
4476 ; CHECK-GI-NEXT: mov v18.s[1], w11
4477 ; CHECK-GI-NEXT: smov w11, v4.b[10]
4478 ; CHECK-GI-NEXT: mov v19.s[1], w16
4479 ; CHECK-GI-NEXT: smov w16, v5.b[4]
4480 ; CHECK-GI-NEXT: mov v20.s[1], w14
4481 ; CHECK-GI-NEXT: smov w14, v4.b[15]
4482 ; CHECK-GI-NEXT: mov v3.s[2], w13
4483 ; CHECK-GI-NEXT: mov v17.s[2], w12
4484 ; CHECK-GI-NEXT: smov w12, v7.b[12]
4485 ; CHECK-GI-NEXT: smov w13, v7.b[7]
4486 ; CHECK-GI-NEXT: mov v16.s[3], w10
4487 ; CHECK-GI-NEXT: smov w10, v7.b[8]
4488 ; CHECK-GI-NEXT: smov w8, v7.b[3]
4489 ; CHECK-GI-NEXT: mov v18.s[2], w11
4490 ; CHECK-GI-NEXT: smov w11, v7.b[2]
4491 ; CHECK-GI-NEXT: mov v23.s[0], w16
4492 ; CHECK-GI-NEXT: mov v20.s[2], w15
4493 ; CHECK-GI-NEXT: smov w15, v5.b[12]
4494 ; CHECK-GI-NEXT: smov w16, v7.b[14]
4495 ; CHECK-GI-NEXT: mov v17.s[3], w14
4496 ; CHECK-GI-NEXT: smov w14, v7.b[13]
4497 ; CHECK-GI-NEXT: mov v22.s[0], w12
4498 ; CHECK-GI-NEXT: smov w12, v7.b[9]
4499 ; CHECK-GI-NEXT: mov v21.s[0], w10
4500 ; CHECK-GI-NEXT: smov w10, v4.b[3]
4501 ; CHECK-GI-NEXT: mov v19.s[2], w11
4502 ; CHECK-GI-NEXT: smov w11, v5.b[0]
4503 ; CHECK-GI-NEXT: mov v27.s[1], wzr
4504 ; CHECK-GI-NEXT: mov v20.s[3], w13
4505 ; CHECK-GI-NEXT: smov w13, v5.b[5]
4506 ; CHECK-GI-NEXT: mov v24.s[0], w15
4507 ; CHECK-GI-NEXT: mov v22.s[1], w14
4508 ; CHECK-GI-NEXT: smov w14, v5.b[8]
4509 ; CHECK-GI-NEXT: smov w15, v4.b[11]
4510 ; CHECK-GI-NEXT: mov v21.s[1], w12
4511 ; CHECK-GI-NEXT: smov w12, v5.b[13]
4512 ; CHECK-GI-NEXT: mov v25.s[3], wzr
4513 ; CHECK-GI-NEXT: mov v4.s[0], w11
4514 ; CHECK-GI-NEXT: smov w11, v5.b[1]
4515 ; CHECK-GI-NEXT: mov v3.s[3], w10
4516 ; CHECK-GI-NEXT: mov v23.s[1], w13
4517 ; CHECK-GI-NEXT: smov w13, v5.b[6]
4518 ; CHECK-GI-NEXT: mov v19.s[3], w8
4519 ; CHECK-GI-NEXT: mov v22.s[2], w16
4520 ; CHECK-GI-NEXT: smov w16, v5.b[9]
4521 ; CHECK-GI-NEXT: mov v26.s[0], w14
4522 ; CHECK-GI-NEXT: mov v24.s[1], w12
4523 ; CHECK-GI-NEXT: smov w12, v5.b[14]
4524 ; CHECK-GI-NEXT: smov w14, v7.b[10]
4525 ; CHECK-GI-NEXT: mov v4.s[1], w11
4526 ; CHECK-GI-NEXT: smov w11, v7.b[15]
4527 ; CHECK-GI-NEXT: mov v18.s[3], w15
4528 ; CHECK-GI-NEXT: mov v23.s[2], w13
4529 ; CHECK-GI-NEXT: smov w13, v5.b[7]
4530 ; CHECK-GI-NEXT: mul v2.4s, v2.4s, v20.4s
4531 ; CHECK-GI-NEXT: mov v26.s[1], w16
4532 ; CHECK-GI-NEXT: smov w16, v5.b[10]
4533 ; CHECK-GI-NEXT: mov v27.s[2], wzr
4534 ; CHECK-GI-NEXT: mov v24.s[2], w12
4535 ; CHECK-GI-NEXT: smov w12, v5.b[15]
4536 ; CHECK-GI-NEXT: mov v21.s[2], w14
4537 ; CHECK-GI-NEXT: smov w14, v7.b[11]
4538 ; CHECK-GI-NEXT: mov v4.s[2], w9
4539 ; CHECK-GI-NEXT: smov w9, v5.b[3]
4540 ; CHECK-GI-NEXT: mov v22.s[3], w11
4541 ; CHECK-GI-NEXT: smov w11, v5.b[11]
4542 ; CHECK-GI-NEXT: mov v23.s[3], w13
4543 ; CHECK-GI-NEXT: mov v26.s[2], w16
4544 ; CHECK-GI-NEXT: mla v2.4s, v0.4s, v19.4s
4545 ; CHECK-GI-NEXT: mov v27.s[3], wzr
4546 ; CHECK-GI-NEXT: mov v24.s[3], w12
4547 ; CHECK-GI-NEXT: mov v21.s[3], w14
4548 ; CHECK-GI-NEXT: mov v4.s[3], w9
4549 ; CHECK-GI-NEXT: mul v5.4s, v6.4s, v22.4s
4550 ; CHECK-GI-NEXT: mul v6.4s, v16.4s, v23.4s
4551 ; CHECK-GI-NEXT: add v16.4s, v25.4s, v25.4s
4552 ; CHECK-GI-NEXT: mov v26.s[3], w11
4553 ; CHECK-GI-NEXT: mul v7.4s, v17.4s, v24.4s
4554 ; CHECK-GI-NEXT: add v0.4s, v25.4s, v16.4s
4555 ; CHECK-GI-NEXT: mla v5.4s, v1.4s, v21.4s
4556 ; CHECK-GI-NEXT: mla v6.4s, v3.4s, v4.4s
4557 ; CHECK-GI-NEXT: add v3.4s, v16.4s, v16.4s
4558 ; CHECK-GI-NEXT: mla v7.4s, v18.4s, v26.4s
4559 ; CHECK-GI-NEXT: add v0.4s, v27.4s, v0.4s
4560 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v5.4s
4561 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v3.4s
4562 ; CHECK-GI-NEXT: add v2.4s, v6.4s, v7.4s
4563 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
4564 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
4565 ; CHECK-GI-NEXT: addv s0, v0.4s
4566 ; CHECK-GI-NEXT: fmov w8, s0
4567 ; CHECK-GI-NEXT: add w0, w8, w2
4568 ; CHECK-GI-NEXT: ret
; IR under test: sign-extend 33 bytes from each of %a and %b to i32, multiply
; lane-wise, sum all 33 products with the vector add-reduction intrinsic, and
; add %sum to the result.
4570 %0 = load <33 x i8>, ptr %a
4571 %1 = sext <33 x i8> %0 to <33 x i32>
4572 %2 = load <33 x i8>, ptr %b
4573 %3 = sext <33 x i8> %2 to <33 x i32>
4574 %4 = mul nsw <33 x i32> %3, %1
4575 %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
4576 %op.extra = add nsw i32 %5, %sum
4580 define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) {
4581 ; CHECK-SD-LABEL: test_sdot_v33i8_double:
4582 ; CHECK-SD: // %bb.0: // %entry
4583 ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
4584 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
4585 ; CHECK-SD-NEXT: .cfi_offset w29, -16
4586 ; CHECK-SD-NEXT: ldr b0, [sp, #344]
4587 ; CHECK-SD-NEXT: add x8, sp, #352
4588 ; CHECK-SD-NEXT: ldr b1, [sp, #80]
4589 ; CHECK-SD-NEXT: ldr b2, [sp, #216]
4590 ; CHECK-SD-NEXT: add x9, sp, #96
4591 ; CHECK-SD-NEXT: add x10, sp, #104
4592 ; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8]
4593 ; CHECK-SD-NEXT: add x8, sp, #88
4594 ; CHECK-SD-NEXT: ldr b4, [sp, #408]
4595 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
4596 ; CHECK-SD-NEXT: add x8, sp, #360
4597 ; CHECK-SD-NEXT: add x12, sp, #248
4598 ; CHECK-SD-NEXT: add x13, sp, #432
4599 ; CHECK-SD-NEXT: add x11, sp, #384
4600 ; CHECK-SD-NEXT: ldr b5, [sp, #144]
4601 ; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8]
4602 ; CHECK-SD-NEXT: add x8, sp, #224
4603 ; CHECK-SD-NEXT: ldr b6, [sp, #280]
4604 ; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8]
4605 ; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9]
4606 ; CHECK-SD-NEXT: add x8, sp, #368
4607 ; CHECK-SD-NEXT: add x9, sp, #232
4608 ; CHECK-SD-NEXT: ldr b16, [sp, #744]
4609 ; CHECK-SD-NEXT: ldr b17, [sp, #480]
4610 ; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8]
4611 ; CHECK-SD-NEXT: add x8, sp, #376
4612 ; CHECK-SD-NEXT: ldr b18, [sp, #936]
4613 ; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9]
4614 ; CHECK-SD-NEXT: ld1 { v1.b }[3], [x10]
4615 ; CHECK-SD-NEXT: add x9, sp, #240
4616 ; CHECK-SD-NEXT: add x10, sp, #392
4617 ; CHECK-SD-NEXT: ldr b19, [sp, #672]
4618 ; CHECK-SD-NEXT: ldr b7, [sp, #16]
4619 ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
4620 ; CHECK-SD-NEXT: add x8, sp, #112
4621 ; CHECK-SD-NEXT: ldr b21, [sp, #1000]
4622 ; CHECK-SD-NEXT: ld1 { v2.b }[3], [x9]
4623 ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8]
4624 ; CHECK-SD-NEXT: add x8, sp, #416
4625 ; CHECK-SD-NEXT: ld1 { v4.b }[1], [x8]
4626 ; CHECK-SD-NEXT: add x8, sp, #120
4627 ; CHECK-SD-NEXT: add x9, sp, #400
4628 ; CHECK-SD-NEXT: ld1 { v0.b }[5], [x11]
4629 ; CHECK-SD-NEXT: add x11, sp, #128
4630 ; CHECK-SD-NEXT: ldr b22, [sp, #736]
4631 ; CHECK-SD-NEXT: ld1 { v2.b }[4], [x12]
4632 ; CHECK-SD-NEXT: add x12, sp, #424
4633 ; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8]
4634 ; CHECK-SD-NEXT: ld1 { v4.b }[2], [x12]
4635 ; CHECK-SD-NEXT: add x12, sp, #152
4636 ; CHECK-SD-NEXT: add x8, sp, #136
4637 ; CHECK-SD-NEXT: ld1 { v5.b }[1], [x12]
4638 ; CHECK-SD-NEXT: add x12, sp, #440
4639 ; CHECK-SD-NEXT: ld1 { v0.b }[6], [x10]
4640 ; CHECK-SD-NEXT: ld1 { v1.b }[6], [x11]
4641 ; CHECK-SD-NEXT: add x11, sp, #288
4642 ; CHECK-SD-NEXT: add x10, sp, #256
4643 ; CHECK-SD-NEXT: ld1 { v4.b }[3], [x13]
4644 ; CHECK-SD-NEXT: ld1 { v6.b }[1], [x11]
4645 ; CHECK-SD-NEXT: add x11, sp, #296
4646 ; CHECK-SD-NEXT: ld1 { v0.b }[7], [x9]
4647 ; CHECK-SD-NEXT: add x9, sp, #160
4648 ; CHECK-SD-NEXT: ld1 { v2.b }[5], [x10]
4649 ; CHECK-SD-NEXT: ld1 { v5.b }[2], [x9]
4650 ; CHECK-SD-NEXT: add x10, sp, #168
4651 ; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8]
4652 ; CHECK-SD-NEXT: ld1 { v4.b }[4], [x12]
4653 ; CHECK-SD-NEXT: add x12, sp, #448
4654 ; CHECK-SD-NEXT: ld1 { v6.b }[2], [x11]
4655 ; CHECK-SD-NEXT: add x11, sp, #304
4656 ; CHECK-SD-NEXT: add x8, sp, #464
4657 ; CHECK-SD-NEXT: add x13, sp, #768
4658 ; CHECK-SD-NEXT: ld1 { v5.b }[3], [x10]
4659 ; CHECK-SD-NEXT: add x10, sp, #176
4660 ; CHECK-SD-NEXT: add x9, sp, #264
4661 ; CHECK-SD-NEXT: ld1 { v4.b }[5], [x12]
4662 ; CHECK-SD-NEXT: add x12, sp, #456
4663 ; CHECK-SD-NEXT: ld1 { v6.b }[3], [x11]
4664 ; CHECK-SD-NEXT: add x11, sp, #760
4665 ; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9]
4666 ; CHECK-SD-NEXT: add x9, sp, #272
4667 ; CHECK-SD-NEXT: ld1 { v5.b }[4], [x10]
4668 ; CHECK-SD-NEXT: add x10, sp, #312
4669 ; CHECK-SD-NEXT: fmov s3, w0
4670 ; CHECK-SD-NEXT: ld1 { v4.b }[6], [x12]
4671 ; CHECK-SD-NEXT: ld1 { v6.b }[4], [x10]
4672 ; CHECK-SD-NEXT: add x10, sp, #320
4673 ; CHECK-SD-NEXT: add x12, sp, #680
4674 ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x9]
4675 ; CHECK-SD-NEXT: add x9, sp, #184
4676 ; CHECK-SD-NEXT: ld1 { v19.b }[1], [x12]
4677 ; CHECK-SD-NEXT: add x12, sp, #776
4678 ; CHECK-SD-NEXT: ld1 { v5.b }[5], [x9]
4679 ; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8]
4680 ; CHECK-SD-NEXT: add x8, sp, #752
4681 ; CHECK-SD-NEXT: ld1 { v6.b }[5], [x10]
4682 ; CHECK-SD-NEXT: ld1 { v16.b }[1], [x8]
4683 ; CHECK-SD-NEXT: add x10, sp, #24
4684 ; CHECK-SD-NEXT: smull v22.8h, v22.8b, v21.8b
4685 ; CHECK-SD-NEXT: ld1 { v7.b }[1], [x10]
4686 ; CHECK-SD-NEXT: add x10, sp, #496
4687 ; CHECK-SD-NEXT: mov v3.b[1], w1
4688 ; CHECK-SD-NEXT: add x9, sp, #192
4689 ; CHECK-SD-NEXT: ldr b20, [sp, #472]
4690 ; CHECK-SD-NEXT: ldr b23, [sp, #208]
4691 ; CHECK-SD-NEXT: ld1 { v16.b }[2], [x11]
4692 ; CHECK-SD-NEXT: add x11, sp, #488
4693 ; CHECK-SD-NEXT: ld1 { v5.b }[6], [x9]
4694 ; CHECK-SD-NEXT: ld1 { v17.b }[1], [x11]
4695 ; CHECK-SD-NEXT: add x11, sp, #944
4696 ; CHECK-SD-NEXT: add x9, sp, #328
4697 ; CHECK-SD-NEXT: ld1 { v18.b }[1], [x11]
4698 ; CHECK-SD-NEXT: add x11, sp, #688
4699 ; CHECK-SD-NEXT: ld1 { v6.b }[6], [x9]
4700 ; CHECK-SD-NEXT: ld1 { v16.b }[3], [x13]
4701 ; CHECK-SD-NEXT: ld1 { v19.b }[2], [x11]
4702 ; CHECK-SD-NEXT: add x11, sp, #504
4703 ; CHECK-SD-NEXT: ld1 { v17.b }[2], [x10]
4704 ; CHECK-SD-NEXT: add x10, sp, #952
4705 ; CHECK-SD-NEXT: add x13, sp, #784
4706 ; CHECK-SD-NEXT: ld1 { v18.b }[2], [x10]
4707 ; CHECK-SD-NEXT: add x10, sp, #32
4708 ; CHECK-SD-NEXT: add x9, sp, #40
4709 ; CHECK-SD-NEXT: ld1 { v16.b }[4], [x12]
4710 ; CHECK-SD-NEXT: add x12, sp, #696
4711 ; CHECK-SD-NEXT: ld1 { v7.b }[2], [x10]
4712 ; CHECK-SD-NEXT: ld1 { v17.b }[3], [x11]
4713 ; CHECK-SD-NEXT: add x11, sp, #960
4714 ; CHECK-SD-NEXT: ld1 { v19.b }[3], [x12]
4715 ; CHECK-SD-NEXT: ld1 { v18.b }[3], [x11]
4716 ; CHECK-SD-NEXT: add x10, sp, #512
4717 ; CHECK-SD-NEXT: add x11, sp, #704
4718 ; CHECK-SD-NEXT: ld1 { v16.b }[5], [x13]
4719 ; CHECK-SD-NEXT: add x12, sp, #792
4720 ; CHECK-SD-NEXT: sshll v24.4s, v22.4h, #0
4721 ; CHECK-SD-NEXT: ld1 { v17.b }[4], [x10]
4722 ; CHECK-SD-NEXT: add x10, sp, #968
4723 ; CHECK-SD-NEXT: ld1 { v19.b }[4], [x11]
4724 ; CHECK-SD-NEXT: ld1 { v18.b }[4], [x10]
4725 ; CHECK-SD-NEXT: add x10, sp, #520
4726 ; CHECK-SD-NEXT: add x11, sp, #976
4727 ; CHECK-SD-NEXT: ld1 { v16.b }[6], [x12]
4728 ; CHECK-SD-NEXT: add x12, sp, #712
4729 ; CHECK-SD-NEXT: smull v20.8h, v23.8b, v20.8b
4730 ; CHECK-SD-NEXT: ld1 { v17.b }[5], [x10]
4731 ; CHECK-SD-NEXT: ld1 { v19.b }[5], [x12]
4732 ; CHECK-SD-NEXT: add x12, sp, #720
4733 ; CHECK-SD-NEXT: ld1 { v18.b }[5], [x11]
4734 ; CHECK-SD-NEXT: add x11, sp, #528
4735 ; CHECK-SD-NEXT: add x10, sp, #800
4736 ; CHECK-SD-NEXT: ld1 { v16.b }[7], [x10]
4737 ; CHECK-SD-NEXT: add x10, sp, #536
4738 ; CHECK-SD-NEXT: ldr b22, [sp, #872]
4739 ; CHECK-SD-NEXT: ld1 { v17.b }[6], [x11]
4740 ; CHECK-SD-NEXT: add x11, sp, #984
4741 ; CHECK-SD-NEXT: ld1 { v19.b }[6], [x12]
4742 ; CHECK-SD-NEXT: ld1 { v18.b }[6], [x11]
4743 ; CHECK-SD-NEXT: add x11, sp, #992
4744 ; CHECK-SD-NEXT: add x12, sp, #728
4745 ; CHECK-SD-NEXT: ldr b23, [sp, #608]
4746 ; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9]
4747 ; CHECK-SD-NEXT: add x9, sp, #880
4748 ; CHECK-SD-NEXT: ld1 { v17.b }[7], [x10]
4749 ; CHECK-SD-NEXT: ld1 { v19.b }[7], [x12]
4750 ; CHECK-SD-NEXT: add x10, sp, #816
4751 ; CHECK-SD-NEXT: ld1 { v18.b }[7], [x11]
4752 ; CHECK-SD-NEXT: add x11, sp, #552
4753 ; CHECK-SD-NEXT: add x12, sp, #616
4754 ; CHECK-SD-NEXT: mov v3.b[2], w2
4755 ; CHECK-SD-NEXT: ld1 { v22.b }[1], [x9]
4756 ; CHECK-SD-NEXT: ld1 { v23.b }[1], [x12]
4757 ; CHECK-SD-NEXT: smull v16.8h, v17.8b, v16.8b
4758 ; CHECK-SD-NEXT: add x12, sp, #560
4759 ; CHECK-SD-NEXT: add x9, sp, #888
4760 ; CHECK-SD-NEXT: smull v17.8h, v19.8b, v18.8b
4761 ; CHECK-SD-NEXT: ldr b18, [sp, #808]
4762 ; CHECK-SD-NEXT: ldr b19, [sp, #544]
4763 ; CHECK-SD-NEXT: add x13, sp, #624
4764 ; CHECK-SD-NEXT: ld1 { v22.b }[2], [x9]
4765 ; CHECK-SD-NEXT: add x9, sp, #896
4766 ; CHECK-SD-NEXT: ld1 { v18.b }[1], [x10]
4767 ; CHECK-SD-NEXT: ld1 { v19.b }[1], [x11]
4768 ; CHECK-SD-NEXT: add x11, sp, #824
4769 ; CHECK-SD-NEXT: add x10, sp, #48
4770 ; CHECK-SD-NEXT: ld1 { v23.b }[2], [x13]
4771 ; CHECK-SD-NEXT: mov v3.b[3], w3
4772 ; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10]
4773 ; CHECK-SD-NEXT: add x10, sp, #832
4774 ; CHECK-SD-NEXT: ld1 { v22.b }[3], [x9]
4775 ; CHECK-SD-NEXT: ld1 { v18.b }[2], [x11]
4776 ; CHECK-SD-NEXT: ld1 { v19.b }[2], [x12]
4777 ; CHECK-SD-NEXT: add x11, sp, #568
4778 ; CHECK-SD-NEXT: add x12, sp, #632
4779 ; CHECK-SD-NEXT: add x9, sp, #904
4780 ; CHECK-SD-NEXT: add x13, sp, #640
4781 ; CHECK-SD-NEXT: ld1 { v23.b }[3], [x12]
4782 ; CHECK-SD-NEXT: add x12, sp, #576
4783 ; CHECK-SD-NEXT: mov v3.b[4], w4
4784 ; CHECK-SD-NEXT: ld1 { v18.b }[3], [x10]
4785 ; CHECK-SD-NEXT: ld1 { v19.b }[3], [x11]
4786 ; CHECK-SD-NEXT: add x11, sp, #840
4787 ; CHECK-SD-NEXT: add x10, sp, #56
4788 ; CHECK-SD-NEXT: ld1 { v22.b }[4], [x9]
4789 ; CHECK-SD-NEXT: add x9, sp, #912
4790 ; CHECK-SD-NEXT: ld1 { v23.b }[4], [x13]
4791 ; CHECK-SD-NEXT: ld1 { v7.b }[5], [x10]
4792 ; CHECK-SD-NEXT: add x10, sp, #848
4793 ; CHECK-SD-NEXT: ld1 { v18.b }[4], [x11]
4794 ; CHECK-SD-NEXT: ld1 { v19.b }[4], [x12]
4795 ; CHECK-SD-NEXT: add x11, sp, #584
4796 ; CHECK-SD-NEXT: add x12, sp, #648
4797 ; CHECK-SD-NEXT: mov v3.b[5], w5
4798 ; CHECK-SD-NEXT: ld1 { v22.b }[5], [x9]
4799 ; CHECK-SD-NEXT: ld1 { v23.b }[5], [x12]
4800 ; CHECK-SD-NEXT: add x12, sp, #592
4801 ; CHECK-SD-NEXT: movi v21.2d, #0000000000000000
4802 ; CHECK-SD-NEXT: ld1 { v18.b }[5], [x10]
4803 ; CHECK-SD-NEXT: ld1 { v19.b }[5], [x11]
4804 ; CHECK-SD-NEXT: add x11, sp, #856
4805 ; CHECK-SD-NEXT: add x9, sp, #920
4806 ; CHECK-SD-NEXT: add x13, sp, #656
4807 ; CHECK-SD-NEXT: add x10, sp, #64
4808 ; CHECK-SD-NEXT: ld1 { v22.b }[6], [x9]
4809 ; CHECK-SD-NEXT: ld1 { v23.b }[6], [x13]
4810 ; CHECK-SD-NEXT: mov v3.b[6], w6
4811 ; CHECK-SD-NEXT: ld1 { v18.b }[6], [x11]
4812 ; CHECK-SD-NEXT: ld1 { v19.b }[6], [x12]
4813 ; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10]
4814 ; CHECK-SD-NEXT: add x10, sp, #864
4815 ; CHECK-SD-NEXT: add x11, sp, #600
4816 ; CHECK-SD-NEXT: add x9, sp, #928
4817 ; CHECK-SD-NEXT: add x12, sp, #664
4818 ; CHECK-SD-NEXT: mov v21.s[0], v24.s[0]
4819 ; CHECK-SD-NEXT: ld1 { v22.b }[7], [x9]
4820 ; CHECK-SD-NEXT: ld1 { v18.b }[7], [x10]
4821 ; CHECK-SD-NEXT: ld1 { v19.b }[7], [x11]
4822 ; CHECK-SD-NEXT: ld1 { v23.b }[7], [x12]
4823 ; CHECK-SD-NEXT: add x8, sp, #200
4824 ; CHECK-SD-NEXT: mov v3.b[7], w7
4825 ; CHECK-SD-NEXT: add x10, sp, #336
4826 ; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8]
4827 ; CHECK-SD-NEXT: add x8, sp, #72
4828 ; CHECK-SD-NEXT: ld1 { v6.b }[7], [x10]
4829 ; CHECK-SD-NEXT: smull v18.8h, v19.8b, v18.8b
4830 ; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
4831 ; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8]
4832 ; CHECK-SD-NEXT: smull v22.8h, v23.8b, v22.8b
4833 ; CHECK-SD-NEXT: sshll v20.4s, v20.4h, #0
4834 ; CHECK-SD-NEXT: smull v0.8h, v1.8b, v0.8b
4835 ; CHECK-SD-NEXT: saddw v1.4s, v21.4s, v16.4h
4836 ; CHECK-SD-NEXT: smull v2.8h, v3.8b, v2.8b
4837 ; CHECK-SD-NEXT: smull v3.8h, v5.8b, v4.8b
4838 ; CHECK-SD-NEXT: smull v4.8h, v7.8b, v6.8b
4839 ; CHECK-SD-NEXT: mov v19.s[0], v20.s[0]
4840 ; CHECK-SD-NEXT: saddl2 v5.4s, v18.8h, v17.8h
4841 ; CHECK-SD-NEXT: saddl v7.4s, v18.4h, v17.4h
4842 ; CHECK-SD-NEXT: saddl2 v6.4s, v16.8h, v22.8h
4843 ; CHECK-SD-NEXT: saddw v1.4s, v1.4s, v22.4h
4844 ; CHECK-SD-NEXT: saddl2 v17.4s, v2.8h, v0.8h
4845 ; CHECK-SD-NEXT: saddl2 v16.4s, v4.8h, v3.8h
4846 ; CHECK-SD-NEXT: saddl v3.4s, v4.4h, v3.4h
4847 ; CHECK-SD-NEXT: saddw v2.4s, v19.4s, v2.4h
4848 ; CHECK-SD-NEXT: add v5.4s, v6.4s, v5.4s
4849 ; CHECK-SD-NEXT: add v1.4s, v1.4s, v7.4s
4850 ; CHECK-SD-NEXT: add v6.4s, v17.4s, v16.4s
4851 ; CHECK-SD-NEXT: saddw v0.4s, v2.4s, v0.4h
4852 ; CHECK-SD-NEXT: add v1.4s, v1.4s, v5.4s
4853 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v3.4s
4854 ; CHECK-SD-NEXT: add v1.4s, v6.4s, v1.4s
4855 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
4856 ; CHECK-SD-NEXT: addv s0, v0.4s
4857 ; CHECK-SD-NEXT: fmov w0, s0
4858 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
4859 ; CHECK-SD-NEXT: ret
4861 ; CHECK-GI-LABEL: test_sdot_v33i8_double:
4862 ; CHECK-GI: // %bb.0: // %entry
4863 ; CHECK-GI-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
4864 ; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
4865 ; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
4866 ; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
4867 ; CHECK-GI-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
4868 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
4869 ; CHECK-GI-NEXT: .cfi_offset w29, -16
4870 ; CHECK-GI-NEXT: .cfi_offset b8, -24
4871 ; CHECK-GI-NEXT: .cfi_offset b9, -32
4872 ; CHECK-GI-NEXT: .cfi_offset b10, -40
4873 ; CHECK-GI-NEXT: .cfi_offset b11, -48
4874 ; CHECK-GI-NEXT: .cfi_offset b12, -56
4875 ; CHECK-GI-NEXT: .cfi_offset b13, -64
4876 ; CHECK-GI-NEXT: .cfi_offset b14, -72
4877 ; CHECK-GI-NEXT: .cfi_offset b15, -80
4878 ; CHECK-GI-NEXT: ldr w8, [sp, #80]
4879 ; CHECK-GI-NEXT: sxtb w9, w0
4880 ; CHECK-GI-NEXT: ldr w10, [sp, #112]
4881 ; CHECK-GI-NEXT: sxtb w11, w4
4882 ; CHECK-GI-NEXT: sxtb w13, w7
4883 ; CHECK-GI-NEXT: sxtb w12, w3
4884 ; CHECK-GI-NEXT: sxtb w8, w8
4885 ; CHECK-GI-NEXT: mov v0.s[0], w9
4886 ; CHECK-GI-NEXT: sxtb w9, w10
4887 ; CHECK-GI-NEXT: mov v3.s[0], w11
4888 ; CHECK-GI-NEXT: sxtb w10, w1
4889 ; CHECK-GI-NEXT: sxtb w11, w5
4890 ; CHECK-GI-NEXT: mov v1.s[0], w8
4891 ; CHECK-GI-NEXT: ldr w8, [sp, #88]
4892 ; CHECK-GI-NEXT: mov v5.s[0], w9
4893 ; CHECK-GI-NEXT: ldr w9, [sp, #120]
4894 ; CHECK-GI-NEXT: ldr w14, [sp, #168]
4895 ; CHECK-GI-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
4896 ; CHECK-GI-NEXT: sxtb w8, w8
4897 ; CHECK-GI-NEXT: mov v0.s[1], w10
4898 ; CHECK-GI-NEXT: sxtb w10, w2
4899 ; CHECK-GI-NEXT: mov v3.s[1], w11
4900 ; CHECK-GI-NEXT: sxtb w9, w9
4901 ; CHECK-GI-NEXT: sxtb w11, w6
4902 ; CHECK-GI-NEXT: mov v1.s[1], w8
4903 ; CHECK-GI-NEXT: ldr w8, [sp, #96]
4904 ; CHECK-GI-NEXT: mov v5.s[1], w9
4905 ; CHECK-GI-NEXT: ldr w9, [sp, #128]
4906 ; CHECK-GI-NEXT: sxtb w8, w8
4907 ; CHECK-GI-NEXT: mov v0.s[2], w10
4908 ; CHECK-GI-NEXT: ldr w10, [sp, #104]
4909 ; CHECK-GI-NEXT: mov v3.s[2], w11
4910 ; CHECK-GI-NEXT: sxtb w9, w9
4911 ; CHECK-GI-NEXT: ldr w11, [sp, #136]
4912 ; CHECK-GI-NEXT: mov v1.s[2], w8
4913 ; CHECK-GI-NEXT: ldr w8, [sp, #144]
4914 ; CHECK-GI-NEXT: sxtb w10, w10
4915 ; CHECK-GI-NEXT: mov v5.s[2], w9
4916 ; CHECK-GI-NEXT: sxtb w11, w11
4917 ; CHECK-GI-NEXT: ldr w9, [sp, #152]
4918 ; CHECK-GI-NEXT: sxtb w8, w8
4919 ; CHECK-GI-NEXT: mov v0.s[3], w12
4920 ; CHECK-GI-NEXT: ldr w12, [sp, #160]
4921 ; CHECK-GI-NEXT: mov v3.s[3], w13
4922 ; CHECK-GI-NEXT: ldr w13, [sp, #176]
4923 ; CHECK-GI-NEXT: sxtb w9, w9
4924 ; CHECK-GI-NEXT: mov v1.s[3], w10
4925 ; CHECK-GI-NEXT: ldr w10, [sp, #208]
4926 ; CHECK-GI-NEXT: mov v2.s[0], w8
4927 ; CHECK-GI-NEXT: mov v5.s[3], w11
4928 ; CHECK-GI-NEXT: ldr w11, [sp, #240]
4929 ; CHECK-GI-NEXT: sxtb w13, w13
4930 ; CHECK-GI-NEXT: sxtb w8, w10
4931 ; CHECK-GI-NEXT: ldr w10, [sp, #184]
4932 ; CHECK-GI-NEXT: sxtb w12, w12
4933 ; CHECK-GI-NEXT: mov v6.s[0], w13
4934 ; CHECK-GI-NEXT: sxtb w11, w11
4935 ; CHECK-GI-NEXT: ldr w13, [sp, #264]
4936 ; CHECK-GI-NEXT: mov v4.s[0], w8
4937 ; CHECK-GI-NEXT: ldr w8, [sp, #216]
4938 ; CHECK-GI-NEXT: sxtb w10, w10
4939 ; CHECK-GI-NEXT: mov v7.s[0], w11
4940 ; CHECK-GI-NEXT: ldr w11, [sp, #248]
4941 ; CHECK-GI-NEXT: mov v2.s[1], w9
4942 ; CHECK-GI-NEXT: sxtb w8, w8
4943 ; CHECK-GI-NEXT: ldr w9, [sp, #192]
4944 ; CHECK-GI-NEXT: mov v6.s[1], w10
4945 ; CHECK-GI-NEXT: sxtb w11, w11
4946 ; CHECK-GI-NEXT: ldr w10, [sp, #200]
4947 ; CHECK-GI-NEXT: mov v4.s[1], w8
4948 ; CHECK-GI-NEXT: ldr w8, [sp, #224]
4949 ; CHECK-GI-NEXT: sxtb w9, w9
4950 ; CHECK-GI-NEXT: mov v7.s[1], w11
4951 ; CHECK-GI-NEXT: ldr w11, [sp, #256]
4952 ; CHECK-GI-NEXT: mov v2.s[2], w12
4953 ; CHECK-GI-NEXT: sxtb w8, w8
4954 ; CHECK-GI-NEXT: ldr w12, [sp, #232]
4955 ; CHECK-GI-NEXT: sxtb w10, w10
4956 ; CHECK-GI-NEXT: mov v6.s[2], w9
4957 ; CHECK-GI-NEXT: sxtb w9, w11
4958 ; CHECK-GI-NEXT: sxtb w11, w14
4959 ; CHECK-GI-NEXT: mov v4.s[2], w8
4960 ; CHECK-GI-NEXT: ldr w14, [sp, #280]
4961 ; CHECK-GI-NEXT: ldr w8, [sp, #272]
4962 ; CHECK-GI-NEXT: mov v7.s[2], w9
4963 ; CHECK-GI-NEXT: mov v2.s[3], w11
4964 ; CHECK-GI-NEXT: sxtb w11, w12
4965 ; CHECK-GI-NEXT: sxtb w12, w13
4966 ; CHECK-GI-NEXT: sxtb w13, w14
4967 ; CHECK-GI-NEXT: ldr w9, [sp, #288]
4968 ; CHECK-GI-NEXT: mov v6.s[3], w10
4969 ; CHECK-GI-NEXT: ldr w10, [sp, #312]
4970 ; CHECK-GI-NEXT: ldr w14, [sp, #544]
4971 ; CHECK-GI-NEXT: mov v4.s[3], w11
4972 ; CHECK-GI-NEXT: ldr w11, [sp, #344]
4973 ; CHECK-GI-NEXT: mov v16.s[0], w13
4974 ; CHECK-GI-NEXT: mov v7.s[3], w12
4975 ; CHECK-GI-NEXT: ldr w12, [sp, #376]
4976 ; CHECK-GI-NEXT: sxtb w10, w10
4977 ; CHECK-GI-NEXT: sxtb w11, w11
4978 ; CHECK-GI-NEXT: sxtb w9, w9
4979 ; CHECK-GI-NEXT: ldr w13, [sp, #296]
4980 ; CHECK-GI-NEXT: mov v19.s[0], w10
4981 ; CHECK-GI-NEXT: ldr w10, [sp, #320]
4982 ; CHECK-GI-NEXT: sxtb w12, w12
4983 ; CHECK-GI-NEXT: mov v17.s[0], w11
4984 ; CHECK-GI-NEXT: ldr w11, [sp, #352]
4985 ; CHECK-GI-NEXT: mov v16.s[1], w9
4986 ; CHECK-GI-NEXT: mov v21.s[0], w12
4987 ; CHECK-GI-NEXT: ldr w12, [sp, #384]
4988 ; CHECK-GI-NEXT: sxtb w10, w10
4989 ; CHECK-GI-NEXT: sxtb w11, w11
4990 ; CHECK-GI-NEXT: sxtb w13, w13
4991 ; CHECK-GI-NEXT: ldr w9, [sp, #304]
4992 ; CHECK-GI-NEXT: mov v19.s[1], w10
4993 ; CHECK-GI-NEXT: ldr w10, [sp, #328]
4994 ; CHECK-GI-NEXT: sxtb w12, w12
4995 ; CHECK-GI-NEXT: mov v17.s[1], w11
4996 ; CHECK-GI-NEXT: ldr w11, [sp, #360]
4997 ; CHECK-GI-NEXT: mov v16.s[2], w13
4998 ; CHECK-GI-NEXT: mov v21.s[1], w12
4999 ; CHECK-GI-NEXT: ldr w12, [sp, #392]
5000 ; CHECK-GI-NEXT: sxtb w10, w10
5001 ; CHECK-GI-NEXT: sxtb w11, w11
5002 ; CHECK-GI-NEXT: ldr w13, [sp, #336]
5003 ; CHECK-GI-NEXT: sxtb w9, w9
5004 ; CHECK-GI-NEXT: mov v19.s[2], w10
5005 ; CHECK-GI-NEXT: ldr w10, [sp, #368]
5006 ; CHECK-GI-NEXT: sxtb w12, w12
5007 ; CHECK-GI-NEXT: mov v17.s[2], w11
5008 ; CHECK-GI-NEXT: sxtb w13, w13
5009 ; CHECK-GI-NEXT: ldr w11, [sp, #400]
5010 ; CHECK-GI-NEXT: mov v21.s[2], w12
5011 ; CHECK-GI-NEXT: ldr w12, [sp, #408]
5012 ; CHECK-GI-NEXT: sxtb w10, w10
5013 ; CHECK-GI-NEXT: sxtb w11, w11
5014 ; CHECK-GI-NEXT: mov v16.s[3], w9
5015 ; CHECK-GI-NEXT: ldr w9, [sp, #416]
5016 ; CHECK-GI-NEXT: mov v19.s[3], w13
5017 ; CHECK-GI-NEXT: ldr w13, [sp, #440]
5018 ; CHECK-GI-NEXT: sxtb w12, w12
5019 ; CHECK-GI-NEXT: mov v17.s[3], w10
5020 ; CHECK-GI-NEXT: ldr w10, [sp, #472]
5021 ; CHECK-GI-NEXT: sxtb w9, w9
5022 ; CHECK-GI-NEXT: sxtb w13, w13
5023 ; CHECK-GI-NEXT: mov v18.s[0], w12
5024 ; CHECK-GI-NEXT: mov v21.s[3], w11
5025 ; CHECK-GI-NEXT: sxtb w10, w10
5026 ; CHECK-GI-NEXT: ldr w11, [sp, #504]
5027 ; CHECK-GI-NEXT: ldr w12, [sp, #424]
5028 ; CHECK-GI-NEXT: mov v22.s[0], w13
5029 ; CHECK-GI-NEXT: ldr w13, [sp, #448]
5030 ; CHECK-GI-NEXT: sxtb w8, w8
5031 ; CHECK-GI-NEXT: mov v20.s[0], w10
5032 ; CHECK-GI-NEXT: ldr w10, [sp, #480]
5033 ; CHECK-GI-NEXT: sxtb w11, w11
5034 ; CHECK-GI-NEXT: sxtb w13, w13
5035 ; CHECK-GI-NEXT: mov v18.s[1], w9
5036 ; CHECK-GI-NEXT: sxtb w12, w12
5037 ; CHECK-GI-NEXT: sxtb w10, w10
5038 ; CHECK-GI-NEXT: mov v23.s[0], w11
5039 ; CHECK-GI-NEXT: ldr w11, [sp, #512]
5040 ; CHECK-GI-NEXT: mov v22.s[1], w13
5041 ; CHECK-GI-NEXT: ldr w13, [sp, #456]
5042 ; CHECK-GI-NEXT: ldr w9, [sp, #432]
5043 ; CHECK-GI-NEXT: mov v20.s[1], w10
5044 ; CHECK-GI-NEXT: ldr w10, [sp, #488]
5045 ; CHECK-GI-NEXT: sxtb w11, w11
5046 ; CHECK-GI-NEXT: sxtb w13, w13
5047 ; CHECK-GI-NEXT: mov v18.s[2], w12
5048 ; CHECK-GI-NEXT: ldr w12, [sp, #464]
5049 ; CHECK-GI-NEXT: sxtb w10, w10
5050 ; CHECK-GI-NEXT: mov v23.s[1], w11
5051 ; CHECK-GI-NEXT: ldr w11, [sp, #520]
5052 ; CHECK-GI-NEXT: mov v22.s[2], w13
5053 ; CHECK-GI-NEXT: ldr w13, [sp, #496]
5054 ; CHECK-GI-NEXT: sxtb w9, w9
5055 ; CHECK-GI-NEXT: mov v20.s[2], w10
5056 ; CHECK-GI-NEXT: sxtb w11, w11
5057 ; CHECK-GI-NEXT: sxtb w12, w12
5058 ; CHECK-GI-NEXT: mov v18.s[3], w9
5059 ; CHECK-GI-NEXT: sxtb w9, w13
5060 ; CHECK-GI-NEXT: ldr w10, [sp, #528]
5061 ; CHECK-GI-NEXT: mov v23.s[2], w11
5062 ; CHECK-GI-NEXT: ldr w11, [sp, #536]
5063 ; CHECK-GI-NEXT: sxtb w13, w14
5064 ; CHECK-GI-NEXT: mov v22.s[3], w12
5065 ; CHECK-GI-NEXT: ldr w12, [sp, #576]
5066 ; CHECK-GI-NEXT: sxtb w10, w10
5067 ; CHECK-GI-NEXT: mov v20.s[3], w9
5068 ; CHECK-GI-NEXT: ldr w9, [sp, #608]
5069 ; CHECK-GI-NEXT: sxtb w11, w11
5070 ; CHECK-GI-NEXT: sxtb w12, w12
5071 ; CHECK-GI-NEXT: mov v24.s[0], w13
5072 ; CHECK-GI-NEXT: ldr w13, [sp, #560]
5073 ; CHECK-GI-NEXT: sxtb w9, w9
5074 ; CHECK-GI-NEXT: mul w8, w8, w11
5075 ; CHECK-GI-NEXT: mov v23.s[3], w10
5076 ; CHECK-GI-NEXT: ldr w10, [sp, #552]
5077 ; CHECK-GI-NEXT: ldr w11, [sp, #584]
5078 ; CHECK-GI-NEXT: mov v27.s[0], w12
5079 ; CHECK-GI-NEXT: mov v26.s[0], w9
5080 ; CHECK-GI-NEXT: ldr w9, [sp, #616]
5081 ; CHECK-GI-NEXT: sxtb w13, w13
5082 ; CHECK-GI-NEXT: sxtb w10, w10
5083 ; CHECK-GI-NEXT: sxtb w11, w11
5084 ; CHECK-GI-NEXT: mov v25.s[0], w8
5085 ; CHECK-GI-NEXT: ldr w8, [sp, #640]
5086 ; CHECK-GI-NEXT: sxtb w9, w9
5087 ; CHECK-GI-NEXT: ldr w12, [sp, #568]
5088 ; CHECK-GI-NEXT: mov v24.s[1], w10
5089 ; CHECK-GI-NEXT: ldr w10, [sp, #592]
5090 ; CHECK-GI-NEXT: mov v27.s[1], w11
5091 ; CHECK-GI-NEXT: mov v26.s[1], w9
5092 ; CHECK-GI-NEXT: ldr w9, [sp, #624]
5093 ; CHECK-GI-NEXT: sxtb w8, w8
5094 ; CHECK-GI-NEXT: sxtb w10, w10
5095 ; CHECK-GI-NEXT: ldr w11, [sp, #600]
5096 ; CHECK-GI-NEXT: sxtb w12, w12
5097 ; CHECK-GI-NEXT: mov v28.s[0], w8
5098 ; CHECK-GI-NEXT: ldr w8, [sp, #648]
5099 ; CHECK-GI-NEXT: sxtb w9, w9
5100 ; CHECK-GI-NEXT: mov v24.s[2], w13
5101 ; CHECK-GI-NEXT: ldr w13, [sp, #632]
5102 ; CHECK-GI-NEXT: mov v27.s[2], w10
5103 ; CHECK-GI-NEXT: mov v26.s[2], w9
5104 ; CHECK-GI-NEXT: sxtb w8, w8
5105 ; CHECK-GI-NEXT: ldr w10, [sp, #656]
5106 ; CHECK-GI-NEXT: sxtb w11, w11
5107 ; CHECK-GI-NEXT: sxtb w13, w13
5108 ; CHECK-GI-NEXT: ldr w9, [sp, #664]
5109 ; CHECK-GI-NEXT: mov v28.s[1], w8
5110 ; CHECK-GI-NEXT: sxtb w10, w10
5111 ; CHECK-GI-NEXT: ldr w8, [sp, #680]
5112 ; CHECK-GI-NEXT: mov v24.s[3], w12
5113 ; CHECK-GI-NEXT: ldr w12, [sp, #672]
5114 ; CHECK-GI-NEXT: mov v27.s[3], w11
5115 ; CHECK-GI-NEXT: ldr w11, [sp, #704]
5116 ; CHECK-GI-NEXT: mov v26.s[3], w13
5117 ; CHECK-GI-NEXT: ldr w13, [sp, #736]
5118 ; CHECK-GI-NEXT: sxtb w12, w12
5119 ; CHECK-GI-NEXT: sxtb w9, w9
5120 ; CHECK-GI-NEXT: sxtb w8, w8
5121 ; CHECK-GI-NEXT: sxtb w11, w11
5122 ; CHECK-GI-NEXT: mov v28.s[2], w10
5123 ; CHECK-GI-NEXT: sxtb w13, w13
5124 ; CHECK-GI-NEXT: mov v29.s[0], w12
5125 ; CHECK-GI-NEXT: ldr w12, [sp, #688]
5126 ; CHECK-GI-NEXT: ldr w10, [sp, #696]
5127 ; CHECK-GI-NEXT: mov v31.s[0], w11
5128 ; CHECK-GI-NEXT: ldr w11, [sp, #712]
5129 ; CHECK-GI-NEXT: mov v30.s[0], w13
5130 ; CHECK-GI-NEXT: ldr w13, [sp, #744]
5131 ; CHECK-GI-NEXT: sxtb w12, w12
5132 ; CHECK-GI-NEXT: ldr w14, [sp, #776]
5133 ; CHECK-GI-NEXT: sxtb w11, w11
5134 ; CHECK-GI-NEXT: mov v28.s[3], w9
5135 ; CHECK-GI-NEXT: ldr w9, [sp, #768]
5136 ; CHECK-GI-NEXT: sxtb w13, w13
5137 ; CHECK-GI-NEXT: mov v29.s[1], w8
5138 ; CHECK-GI-NEXT: ldr w8, [sp, #720]
5139 ; CHECK-GI-NEXT: mov v31.s[1], w11
5140 ; CHECK-GI-NEXT: sxtb w9, w9
5141 ; CHECK-GI-NEXT: ldr w11, [sp, #728]
5142 ; CHECK-GI-NEXT: mov v30.s[1], w13
5143 ; CHECK-GI-NEXT: ldr w13, [sp, #752]
5144 ; CHECK-GI-NEXT: sxtb w8, w8
5145 ; CHECK-GI-NEXT: mov v8.s[0], w9
5146 ; CHECK-GI-NEXT: sxtb w11, w11
5147 ; CHECK-GI-NEXT: mul v3.4s, v3.4s, v19.4s
5148 ; CHECK-GI-NEXT: sxtb w9, w13
5149 ; CHECK-GI-NEXT: mov v29.s[2], w12
5150 ; CHECK-GI-NEXT: ldr w12, [sp, #760]
5151 ; CHECK-GI-NEXT: mov v31.s[2], w8
5152 ; CHECK-GI-NEXT: sxtb w8, w10
5153 ; CHECK-GI-NEXT: sxtb w10, w14
5154 ; CHECK-GI-NEXT: mov v30.s[2], w9
5155 ; CHECK-GI-NEXT: ldr w14, [sp, #808]
5156 ; CHECK-GI-NEXT: ldr w13, [sp, #784]
5157 ; CHECK-GI-NEXT: mov v8.s[1], w10
5158 ; CHECK-GI-NEXT: sxtb w10, w12
5159 ; CHECK-GI-NEXT: ldr w9, [sp, #792]
5160 ; CHECK-GI-NEXT: sxtb w12, w14
5161 ; CHECK-GI-NEXT: sxtb w13, w13
5162 ; CHECK-GI-NEXT: mul v5.4s, v5.4s, v21.4s
5163 ; CHECK-GI-NEXT: mov v31.s[3], w11
5164 ; CHECK-GI-NEXT: ldr w11, [sp, #840]
5165 ; CHECK-GI-NEXT: sxtb w9, w9
5166 ; CHECK-GI-NEXT: mov v30.s[3], w10
5167 ; CHECK-GI-NEXT: ldr w10, [sp, #872]
5168 ; CHECK-GI-NEXT: mov v9.s[0], w12
5169 ; CHECK-GI-NEXT: ldr w12, [sp, #816]
5170 ; CHECK-GI-NEXT: sxtb w11, w11
5171 ; CHECK-GI-NEXT: mov v8.s[2], w13
5172 ; CHECK-GI-NEXT: sxtb w10, w10
5173 ; CHECK-GI-NEXT: ldr w13, [sp, #824]
5174 ; CHECK-GI-NEXT: mov v21.s[0], wzr
5175 ; CHECK-GI-NEXT: sxtb w12, w12
5176 ; CHECK-GI-NEXT: mov v11.s[0], w11
5177 ; CHECK-GI-NEXT: ldr w11, [sp, #848]
5178 ; CHECK-GI-NEXT: mov v10.s[0], w10
5179 ; CHECK-GI-NEXT: ldr w10, [sp, #880]
5180 ; CHECK-GI-NEXT: sxtb w13, w13
5181 ; CHECK-GI-NEXT: mov v9.s[1], w12
5182 ; CHECK-GI-NEXT: sxtb w11, w11
5183 ; CHECK-GI-NEXT: mov v8.s[3], w9
5184 ; CHECK-GI-NEXT: ldr w9, [sp, #904]
5185 ; CHECK-GI-NEXT: sxtb w10, w10
5186 ; CHECK-GI-NEXT: ldr w12, [sp, #832]
5187 ; CHECK-GI-NEXT: mov v11.s[1], w11
5188 ; CHECK-GI-NEXT: ldr w11, [sp, #856]
5189 ; CHECK-GI-NEXT: mov v29.s[3], w8
5190 ; CHECK-GI-NEXT: mov v10.s[1], w10
5191 ; CHECK-GI-NEXT: ldr w10, [sp, #888]
5192 ; CHECK-GI-NEXT: sxtb w9, w9
5193 ; CHECK-GI-NEXT: mov v9.s[2], w13
5194 ; CHECK-GI-NEXT: sxtb w11, w11
5195 ; CHECK-GI-NEXT: sxtb w12, w12
5196 ; CHECK-GI-NEXT: mov v12.s[0], w9
5197 ; CHECK-GI-NEXT: ldr w9, [sp, #912]
5198 ; CHECK-GI-NEXT: sxtb w10, w10
5199 ; CHECK-GI-NEXT: mov v11.s[2], w11
5200 ; CHECK-GI-NEXT: ldr w11, [sp, #896]
5201 ; CHECK-GI-NEXT: ldr w13, [sp, #864]
5202 ; CHECK-GI-NEXT: mov v10.s[2], w10
5203 ; CHECK-GI-NEXT: sxtb w9, w9
5204 ; CHECK-GI-NEXT: ldr w10, [sp, #920]
5205 ; CHECK-GI-NEXT: mov v9.s[3], w12
5206 ; CHECK-GI-NEXT: ldr w12, [sp, #968]
5207 ; CHECK-GI-NEXT: sxtb w11, w11
5208 ; CHECK-GI-NEXT: mov v12.s[1], w9
5209 ; CHECK-GI-NEXT: sxtb w10, w10
5210 ; CHECK-GI-NEXT: sxtb w13, w13
5211 ; CHECK-GI-NEXT: sxtb w12, w12
5212 ; CHECK-GI-NEXT: ldr w8, [sp, #800]
5213 ; CHECK-GI-NEXT: ldr w9, [sp, #928]
5214 ; CHECK-GI-NEXT: mov v10.s[3], w11
5215 ; CHECK-GI-NEXT: ldr w11, [sp, #1032]
5216 ; CHECK-GI-NEXT: mov v11.s[3], w13
5217 ; CHECK-GI-NEXT: mov v14.s[0], w12
5218 ; CHECK-GI-NEXT: ldr w12, [sp, #976]
5219 ; CHECK-GI-NEXT: ldr w13, [sp, #936]
5220 ; CHECK-GI-NEXT: mov v12.s[2], w10
5221 ; CHECK-GI-NEXT: ldr w10, [sp, #1000]
5222 ; CHECK-GI-NEXT: sxtb w11, w11
5223 ; CHECK-GI-NEXT: sxtb w12, w12
5224 ; CHECK-GI-NEXT: sxtb w13, w13
5225 ; CHECK-GI-NEXT: mov v21.s[1], wzr
5226 ; CHECK-GI-NEXT: mov v15.s[0], w11
5227 ; CHECK-GI-NEXT: ldr w11, [sp, #1040]
5228 ; CHECK-GI-NEXT: sxtb w10, w10
5229 ; CHECK-GI-NEXT: mov v14.s[1], w12
5230 ; CHECK-GI-NEXT: ldr w12, [sp, #984]
5231 ; CHECK-GI-NEXT: mov v13.s[0], w13
5232 ; CHECK-GI-NEXT: mov v19.s[0], w10
5233 ; CHECK-GI-NEXT: ldr w10, [sp, #1008]
5234 ; CHECK-GI-NEXT: sxtb w11, w11
5235 ; CHECK-GI-NEXT: sxtb w12, w12
5236 ; CHECK-GI-NEXT: ldr w13, [sp, #944]
5237 ; CHECK-GI-NEXT: sxtb w8, w8
5238 ; CHECK-GI-NEXT: mov v15.s[1], w11
5239 ; CHECK-GI-NEXT: ldr w11, [sp, #1048]
5240 ; CHECK-GI-NEXT: sxtb w10, w10
5241 ; CHECK-GI-NEXT: mov v14.s[2], w12
5242 ; CHECK-GI-NEXT: ldr w12, [sp, #1064]
5243 ; CHECK-GI-NEXT: sxtb w13, w13
5244 ; CHECK-GI-NEXT: sxtb w11, w11
5245 ; CHECK-GI-NEXT: mov v19.s[1], w10
5246 ; CHECK-GI-NEXT: ldr w10, [sp, #992]
5247 ; CHECK-GI-NEXT: sxtb w12, w12
5248 ; CHECK-GI-NEXT: mov v13.s[1], w13
5249 ; CHECK-GI-NEXT: ldr w13, [sp, #952]
5250 ; CHECK-GI-NEXT: mov v15.s[2], w11
5251 ; CHECK-GI-NEXT: ldr w11, [sp, #1016]
5252 ; CHECK-GI-NEXT: sxtb w9, w9
5253 ; CHECK-GI-NEXT: mul w8, w8, w12
5254 ; CHECK-GI-NEXT: ldr w12, [sp, #1056]
5255 ; CHECK-GI-NEXT: sxtb w10, w10
5256 ; CHECK-GI-NEXT: sxtb w13, w13
5257 ; CHECK-GI-NEXT: mov v12.s[3], w9
5258 ; CHECK-GI-NEXT: sxtb w9, w11
5259 ; CHECK-GI-NEXT: mov v14.s[3], w10
5260 ; CHECK-GI-NEXT: mov v21.s[2], wzr
5261 ; CHECK-GI-NEXT: sxtb w10, w12
5262 ; CHECK-GI-NEXT: mul v6.4s, v6.4s, v22.4s
5263 ; CHECK-GI-NEXT: mov v22.s[0], w8
5264 ; CHECK-GI-NEXT: mov v13.s[2], w13
5265 ; CHECK-GI-NEXT: mov v19.s[2], w9
5266 ; CHECK-GI-NEXT: ldr w8, [sp, #960]
5267 ; CHECK-GI-NEXT: mov v15.s[3], w10
5268 ; CHECK-GI-NEXT: ldr w9, [sp, #1024]
5269 ; CHECK-GI-NEXT: mov v25.s[1], wzr
5270 ; CHECK-GI-NEXT: mul v7.4s, v7.4s, v23.4s
5271 ; CHECK-GI-NEXT: mov v21.s[3], wzr
5272 ; CHECK-GI-NEXT: sxtb w8, w8
5273 ; CHECK-GI-NEXT: mul v23.4s, v27.4s, v11.4s
5274 ; CHECK-GI-NEXT: sxtb w9, w9
5275 ; CHECK-GI-NEXT: mov v22.s[1], wzr
5276 ; CHECK-GI-NEXT: mul v27.4s, v28.4s, v12.4s
5277 ; CHECK-GI-NEXT: mul v28.4s, v31.4s, v14.4s
5278 ; CHECK-GI-NEXT: mul v31.4s, v8.4s, v15.4s
5279 ; CHECK-GI-NEXT: mov v13.s[3], w8
5280 ; CHECK-GI-NEXT: mov v19.s[3], w9
5281 ; CHECK-GI-NEXT: mla v3.4s, v0.4s, v16.4s
5282 ; CHECK-GI-NEXT: mov v25.s[2], wzr
5283 ; CHECK-GI-NEXT: add v0.4s, v21.4s, v21.4s
5284 ; CHECK-GI-NEXT: mla v5.4s, v1.4s, v17.4s
5285 ; CHECK-GI-NEXT: mla v6.4s, v2.4s, v18.4s
5286 ; CHECK-GI-NEXT: mov v22.s[2], wzr
5287 ; CHECK-GI-NEXT: mla v7.4s, v4.4s, v20.4s
5288 ; CHECK-GI-NEXT: mla v23.4s, v24.4s, v9.4s
5289 ; CHECK-GI-NEXT: mla v27.4s, v26.4s, v10.4s
5290 ; CHECK-GI-NEXT: mla v28.4s, v29.4s, v13.4s
5291 ; CHECK-GI-NEXT: mla v31.4s, v30.4s, v19.4s
5292 ; CHECK-GI-NEXT: add v1.4s, v21.4s, v0.4s
5293 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v0.4s
5294 ; CHECK-GI-NEXT: mov v25.s[3], wzr
5295 ; CHECK-GI-NEXT: add v2.4s, v3.4s, v5.4s
5296 ; CHECK-GI-NEXT: mov v22.s[3], wzr
5297 ; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s
5298 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
5299 ; CHECK-GI-NEXT: add v1.4s, v23.4s, v27.4s
5300 ; CHECK-GI-NEXT: add v4.4s, v28.4s, v31.4s
5301 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
5302 ; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s
5303 ; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
5304 ; CHECK-GI-NEXT: add v3.4s, v25.4s, v0.4s
5305 ; CHECK-GI-NEXT: add v0.4s, v22.4s, v0.4s
5306 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v4.4s
5307 ; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
5308 ; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s
5309 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
5310 ; CHECK-GI-NEXT: addv s1, v2.4s
5311 ; CHECK-GI-NEXT: addv s0, v0.4s
5312 ; CHECK-GI-NEXT: fmov w8, s1
5313 ; CHECK-GI-NEXT: fmov w9, s0
5314 ; CHECK-GI-NEXT: add w0, w8, w9
5315 ; CHECK-GI-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
5316 ; CHECK-GI-NEXT: ret
5318 %az = sext <33 x i8> %a to <33 x i32>
5319 %bz = sext <33 x i8> %b to <33 x i32>
5320 %m1 = mul nuw nsw <33 x i32> %az, %bz
5321 %r1 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %m1)
5322 %cz = sext <33 x i8> %c to <33 x i32>
5323 %dz = sext <33 x i8> %d to <33 x i32>
5324 %m2 = mul nuw nsw <33 x i32> %cz, %dz
5325 %r2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %m2)
5326 %x = add i32 %r1, %r2
5330 define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) {
5331 ; CHECK-SD-LABEL: test_sdot_v33i8_double_nomla:
5332 ; CHECK-SD: // %bb.0: // %entry
5333 ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
5334 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
5335 ; CHECK-SD-NEXT: .cfi_offset w29, -16
5336 ; CHECK-SD-NEXT: ldr b0, [sp, #80]
5337 ; CHECK-SD-NEXT: add x8, sp, #88
5338 ; CHECK-SD-NEXT: ldr b2, [sp, #144]
5339 ; CHECK-SD-NEXT: add x9, sp, #152
5340 ; CHECK-SD-NEXT: ldr b3, [sp, #16]
5341 ; CHECK-SD-NEXT: add x11, sp, #104
5342 ; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8]
5343 ; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9]
5344 ; CHECK-SD-NEXT: add x9, sp, #24
5345 ; CHECK-SD-NEXT: add x8, sp, #96
5346 ; CHECK-SD-NEXT: ld1 { v3.b }[1], [x9]
5347 ; CHECK-SD-NEXT: ldr b5, [sp, #480]
5348 ; CHECK-SD-NEXT: fmov s1, w0
5349 ; CHECK-SD-NEXT: add x10, sp, #112
5350 ; CHECK-SD-NEXT: add x12, sp, #168
5351 ; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8]
5352 ; CHECK-SD-NEXT: add x8, sp, #160
5353 ; CHECK-SD-NEXT: ldr b4, [sp, #608]
5354 ; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8]
5355 ; CHECK-SD-NEXT: add x8, sp, #32
5356 ; CHECK-SD-NEXT: add x13, sp, #496
5357 ; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8]
5358 ; CHECK-SD-NEXT: mov v1.b[1], w1
5359 ; CHECK-SD-NEXT: ldr b6, [sp, #672]
5360 ; CHECK-SD-NEXT: ld1 { v0.b }[3], [x11]
5361 ; CHECK-SD-NEXT: add x11, sp, #488
5362 ; CHECK-SD-NEXT: add x9, sp, #120
5363 ; CHECK-SD-NEXT: ld1 { v5.b }[1], [x11]
5364 ; CHECK-SD-NEXT: add x11, sp, #40
5365 ; CHECK-SD-NEXT: ld1 { v2.b }[3], [x12]
5366 ; CHECK-SD-NEXT: ld1 { v3.b }[3], [x11]
5367 ; CHECK-SD-NEXT: add x12, sp, #616
5368 ; CHECK-SD-NEXT: ldr b16, [sp, #544]
5369 ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x10]
5370 ; CHECK-SD-NEXT: add x10, sp, #48
5371 ; CHECK-SD-NEXT: ld1 { v4.b }[1], [x12]
5372 ; CHECK-SD-NEXT: add x12, sp, #176
5373 ; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13]
5374 ; CHECK-SD-NEXT: add x13, sp, #680
5375 ; CHECK-SD-NEXT: ld1 { v3.b }[4], [x10]
5376 ; CHECK-SD-NEXT: ld1 { v2.b }[4], [x12]
5377 ; CHECK-SD-NEXT: ld1 { v6.b }[1], [x13]
5378 ; CHECK-SD-NEXT: add x13, sp, #56
5379 ; CHECK-SD-NEXT: ld1 { v0.b }[5], [x9]
5380 ; CHECK-SD-NEXT: mov v1.b[2], w2
5381 ; CHECK-SD-NEXT: add x8, sp, #128
5382 ; CHECK-SD-NEXT: add x14, sp, #184
5383 ; CHECK-SD-NEXT: add x11, sp, #136
5384 ; CHECK-SD-NEXT: ld1 { v3.b }[5], [x13]
5385 ; CHECK-SD-NEXT: add x13, sp, #552
5386 ; CHECK-SD-NEXT: ld1 { v2.b }[5], [x14]
5387 ; CHECK-SD-NEXT: ld1 { v16.b }[1], [x13]
5388 ; CHECK-SD-NEXT: add x14, sp, #624
5389 ; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8]
5390 ; CHECK-SD-NEXT: add x8, sp, #688
5391 ; CHECK-SD-NEXT: add x13, sp, #504
5392 ; CHECK-SD-NEXT: ld1 { v4.b }[2], [x14]
5393 ; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8]
5394 ; CHECK-SD-NEXT: add x8, sp, #560
5395 ; CHECK-SD-NEXT: ld1 { v5.b }[3], [x13]
5396 ; CHECK-SD-NEXT: ld1 { v16.b }[2], [x8]
5397 ; CHECK-SD-NEXT: mov v1.b[3], w3
5398 ; CHECK-SD-NEXT: add x9, sp, #64
5399 ; CHECK-SD-NEXT: add x15, sp, #632
5400 ; CHECK-SD-NEXT: ld1 { v3.b }[6], [x9]
5401 ; CHECK-SD-NEXT: ld1 { v0.b }[7], [x11]
5402 ; CHECK-SD-NEXT: ld1 { v4.b }[3], [x15]
5403 ; CHECK-SD-NEXT: add x8, sp, #696
5404 ; CHECK-SD-NEXT: add x9, sp, #568
5405 ; CHECK-SD-NEXT: add x11, sp, #512
5406 ; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8]
5407 ; CHECK-SD-NEXT: ld1 { v16.b }[3], [x9]
5408 ; CHECK-SD-NEXT: ld1 { v5.b }[4], [x11]
5409 ; CHECK-SD-NEXT: add x8, sp, #640
5410 ; CHECK-SD-NEXT: mov v1.b[4], w4
5411 ; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8]
5412 ; CHECK-SD-NEXT: add x8, sp, #704
5413 ; CHECK-SD-NEXT: add x9, sp, #576
5414 ; CHECK-SD-NEXT: add x11, sp, #520
5415 ; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8]
5416 ; CHECK-SD-NEXT: ld1 { v16.b }[4], [x9]
5417 ; CHECK-SD-NEXT: ld1 { v5.b }[5], [x11]
5418 ; CHECK-SD-NEXT: ldr b18, [sp, #736]
5419 ; CHECK-SD-NEXT: add x12, sp, #192
5420 ; CHECK-SD-NEXT: ld1 { v2.b }[6], [x12]
5421 ; CHECK-SD-NEXT: add x8, sp, #648
5422 ; CHECK-SD-NEXT: add x9, sp, #528
5423 ; CHECK-SD-NEXT: add x11, sp, #712
5424 ; CHECK-SD-NEXT: add x12, sp, #584
5425 ; CHECK-SD-NEXT: sshll v18.8h, v18.8b, #0
5426 ; CHECK-SD-NEXT: mov v1.b[5], w5
5427 ; CHECK-SD-NEXT: ld1 { v6.b }[5], [x11]
5428 ; CHECK-SD-NEXT: ld1 { v16.b }[5], [x12]
5429 ; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8]
5430 ; CHECK-SD-NEXT: ld1 { v5.b }[6], [x9]
5431 ; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
5432 ; CHECK-SD-NEXT: add x8, sp, #656
5433 ; CHECK-SD-NEXT: add x9, sp, #536
5434 ; CHECK-SD-NEXT: add x11, sp, #720
5435 ; CHECK-SD-NEXT: add x12, sp, #592
5436 ; CHECK-SD-NEXT: sshll v18.4s, v18.4h, #0
5437 ; CHECK-SD-NEXT: ldr b7, [sp, #208]
5438 ; CHECK-SD-NEXT: ld1 { v6.b }[6], [x11]
5439 ; CHECK-SD-NEXT: ld1 { v16.b }[6], [x12]
5440 ; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8]
5441 ; CHECK-SD-NEXT: ld1 { v5.b }[7], [x9]
5442 ; CHECK-SD-NEXT: mov v1.b[6], w6
5443 ; CHECK-SD-NEXT: sshll v7.8h, v7.8b, #0
5444 ; CHECK-SD-NEXT: add x8, sp, #664
5445 ; CHECK-SD-NEXT: add x9, sp, #728
5446 ; CHECK-SD-NEXT: add x11, sp, #600
5447 ; CHECK-SD-NEXT: mov v17.s[0], v18.s[0]
5448 ; CHECK-SD-NEXT: ld1 { v6.b }[7], [x9]
5449 ; CHECK-SD-NEXT: ld1 { v16.b }[7], [x11]
5450 ; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8]
5451 ; CHECK-SD-NEXT: sshll v5.8h, v5.8b, #0
5452 ; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
5453 ; CHECK-SD-NEXT: add x10, sp, #200
5454 ; CHECK-SD-NEXT: mov v1.b[7], w7
5455 ; CHECK-SD-NEXT: add x9, sp, #72
5456 ; CHECK-SD-NEXT: sshll v7.4s, v7.4h, #0
5457 ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x10]
5458 ; CHECK-SD-NEXT: ld1 { v3.b }[7], [x9]
5459 ; CHECK-SD-NEXT: sshll v6.8h, v6.8b, #0
5460 ; CHECK-SD-NEXT: sshll v16.8h, v16.8b, #0
5461 ; CHECK-SD-NEXT: sshll v4.8h, v4.8b, #0
5462 ; CHECK-SD-NEXT: saddw v17.4s, v17.4s, v5.4h
5463 ; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
5464 ; CHECK-SD-NEXT: mov v18.s[0], v7.s[0]
5465 ; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
5466 ; CHECK-SD-NEXT: sshll v2.8h, v2.8b, #0
5467 ; CHECK-SD-NEXT: sshll v3.8h, v3.8b, #0
5468 ; CHECK-SD-NEXT: saddl2 v7.4s, v16.8h, v6.8h
5469 ; CHECK-SD-NEXT: saddl2 v5.4s, v5.8h, v4.8h
5470 ; CHECK-SD-NEXT: saddl v6.4s, v16.4h, v6.4h
5471 ; CHECK-SD-NEXT: saddw v4.4s, v17.4s, v4.4h
5472 ; CHECK-SD-NEXT: saddl2 v17.4s, v1.8h, v0.8h
5473 ; CHECK-SD-NEXT: saddl2 v16.4s, v3.8h, v2.8h
5474 ; CHECK-SD-NEXT: saddw v1.4s, v18.4s, v1.4h
5475 ; CHECK-SD-NEXT: add v5.4s, v5.4s, v7.4s
5476 ; CHECK-SD-NEXT: add v4.4s, v4.4s, v6.4s
5477 ; CHECK-SD-NEXT: saddl v2.4s, v3.4h, v2.4h
5478 ; CHECK-SD-NEXT: add v6.4s, v17.4s, v16.4s
5479 ; CHECK-SD-NEXT: saddw v0.4s, v1.4s, v0.4h
5480 ; CHECK-SD-NEXT: add v1.4s, v4.4s, v5.4s
5481 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
5482 ; CHECK-SD-NEXT: add v1.4s, v6.4s, v1.4s
5483 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
5484 ; CHECK-SD-NEXT: addv s0, v0.4s
5485 ; CHECK-SD-NEXT: fmov w0, s0
5486 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
5487 ; CHECK-SD-NEXT: ret
5489 ; CHECK-GI-LABEL: test_sdot_v33i8_double_nomla:
5490 ; CHECK-GI: // %bb.0: // %entry
5491 ; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
5492 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
5493 ; CHECK-GI-NEXT: .cfi_offset w29, -16
5494 ; CHECK-GI-NEXT: sxtb w10, w0
5495 ; CHECK-GI-NEXT: ldr w9, [sp, #16]
5496 ; CHECK-GI-NEXT: ldr w8, [sp, #48]
5497 ; CHECK-GI-NEXT: sxtb w11, w4
5498 ; CHECK-GI-NEXT: sxtb w12, w6
5499 ; CHECK-GI-NEXT: ldr w13, [sp, #592]
5500 ; CHECK-GI-NEXT: mov v0.s[0], w10
5501 ; CHECK-GI-NEXT: sxtb w9, w9
5502 ; CHECK-GI-NEXT: sxtb w8, w8
5503 ; CHECK-GI-NEXT: mov v1.s[0], w11
5504 ; CHECK-GI-NEXT: sxtb w10, w1
5505 ; CHECK-GI-NEXT: sxtb w11, w5
5506 ; CHECK-GI-NEXT: mov v2.s[0], w9
5507 ; CHECK-GI-NEXT: ldr w9, [sp, #24]
5508 ; CHECK-GI-NEXT: mov v3.s[0], w8
5509 ; CHECK-GI-NEXT: ldr w8, [sp, #56]
5510 ; CHECK-GI-NEXT: mov v22.s[0], wzr
5511 ; CHECK-GI-NEXT: mov v0.s[1], w10
5512 ; CHECK-GI-NEXT: sxtb w9, w9
5513 ; CHECK-GI-NEXT: ldr w10, [sp, #80]
5514 ; CHECK-GI-NEXT: sxtb w8, w8
5515 ; CHECK-GI-NEXT: mov v1.s[1], w11
5516 ; CHECK-GI-NEXT: sxtb w11, w2
5517 ; CHECK-GI-NEXT: mov v2.s[1], w9
5518 ; CHECK-GI-NEXT: ldr w9, [sp, #32]
5519 ; CHECK-GI-NEXT: sxtb w10, w10
5520 ; CHECK-GI-NEXT: mov v3.s[1], w8
5521 ; CHECK-GI-NEXT: ldr w8, [sp, #64]
5522 ; CHECK-GI-NEXT: mov v22.s[1], wzr
5523 ; CHECK-GI-NEXT: mov v0.s[2], w11
5524 ; CHECK-GI-NEXT: sxtb w9, w9
5525 ; CHECK-GI-NEXT: sxtb w11, w3
5526 ; CHECK-GI-NEXT: sxtb w8, w8
5527 ; CHECK-GI-NEXT: mov v1.s[2], w12
5528 ; CHECK-GI-NEXT: mov v4.s[0], w10
5529 ; CHECK-GI-NEXT: ldr w10, [sp, #40]
5530 ; CHECK-GI-NEXT: mov v2.s[2], w9
5531 ; CHECK-GI-NEXT: ldr w9, [sp, #88]
5532 ; CHECK-GI-NEXT: sxtb w12, w7
5533 ; CHECK-GI-NEXT: mov v3.s[2], w8
5534 ; CHECK-GI-NEXT: ldr w8, [sp, #72]
5535 ; CHECK-GI-NEXT: mov v0.s[3], w11
5536 ; CHECK-GI-NEXT: ldr w11, [sp, #112]
5537 ; CHECK-GI-NEXT: sxtb w10, w10
5538 ; CHECK-GI-NEXT: sxtb w9, w9
5539 ; CHECK-GI-NEXT: mov v1.s[3], w12
5540 ; CHECK-GI-NEXT: ldr w12, [sp, #144]
5541 ; CHECK-GI-NEXT: sxtb w8, w8
5542 ; CHECK-GI-NEXT: sxtb w11, w11
5543 ; CHECK-GI-NEXT: mov v2.s[3], w10
5544 ; CHECK-GI-NEXT: ldr w10, [sp, #96]
5545 ; CHECK-GI-NEXT: mov v4.s[1], w9
5546 ; CHECK-GI-NEXT: ldr w9, [sp, #176]
5547 ; CHECK-GI-NEXT: sxtb w12, w12
5548 ; CHECK-GI-NEXT: mov v3.s[3], w8
5549 ; CHECK-GI-NEXT: ldr w8, [sp, #120]
5550 ; CHECK-GI-NEXT: mov v5.s[0], w11
5551 ; CHECK-GI-NEXT: sxtb w10, w10
5552 ; CHECK-GI-NEXT: sxtb w9, w9
5553 ; CHECK-GI-NEXT: ldr w11, [sp, #152]
5554 ; CHECK-GI-NEXT: mov v6.s[0], w12
5555 ; CHECK-GI-NEXT: sxtb w8, w8
5556 ; CHECK-GI-NEXT: ldr w12, [sp, #104]
5557 ; CHECK-GI-NEXT: mov v4.s[2], w10
5558 ; CHECK-GI-NEXT: ldr w10, [sp, #184]
5559 ; CHECK-GI-NEXT: mov v7.s[0], w9
5560 ; CHECK-GI-NEXT: sxtb w11, w11
5561 ; CHECK-GI-NEXT: ldr w9, [sp, #128]
5562 ; CHECK-GI-NEXT: mov v5.s[1], w8
5563 ; CHECK-GI-NEXT: sxtb w12, w12
5564 ; CHECK-GI-NEXT: sxtb w10, w10
5565 ; CHECK-GI-NEXT: ldr w8, [sp, #160]
5566 ; CHECK-GI-NEXT: mov v6.s[1], w11
5567 ; CHECK-GI-NEXT: ldr w11, [sp, #480]
5568 ; CHECK-GI-NEXT: sxtb w9, w9
5569 ; CHECK-GI-NEXT: mov v4.s[3], w12
5570 ; CHECK-GI-NEXT: ldr w12, [sp, #192]
5571 ; CHECK-GI-NEXT: mov v7.s[1], w10
5572 ; CHECK-GI-NEXT: sxtb w8, w8
5573 ; CHECK-GI-NEXT: ldr w10, [sp, #136]
5574 ; CHECK-GI-NEXT: mov v5.s[2], w9
5575 ; CHECK-GI-NEXT: sxtb w9, w11
5576 ; CHECK-GI-NEXT: sxtb w12, w12
5577 ; CHECK-GI-NEXT: ldr w11, [sp, #168]
5578 ; CHECK-GI-NEXT: mov v6.s[2], w8
5579 ; CHECK-GI-NEXT: ldr w8, [sp, #488]
5580 ; CHECK-GI-NEXT: mov v16.s[0], w9
5581 ; CHECK-GI-NEXT: sxtb w10, w10
5582 ; CHECK-GI-NEXT: ldr w9, [sp, #200]
5583 ; CHECK-GI-NEXT: mov v7.s[2], w12
5584 ; CHECK-GI-NEXT: sxtb w11, w11
5585 ; CHECK-GI-NEXT: ldr w12, [sp, #512]
5586 ; CHECK-GI-NEXT: sxtb w8, w8
5587 ; CHECK-GI-NEXT: mov v5.s[3], w10
5588 ; CHECK-GI-NEXT: ldr w10, [sp, #544]
5589 ; CHECK-GI-NEXT: sxtb w9, w9
5590 ; CHECK-GI-NEXT: mov v6.s[3], w11
5591 ; CHECK-GI-NEXT: ldr w11, [sp, #496]
5592 ; CHECK-GI-NEXT: sxtb w12, w12
5593 ; CHECK-GI-NEXT: mov v16.s[1], w8
5594 ; CHECK-GI-NEXT: ldr w8, [sp, #208]
5595 ; CHECK-GI-NEXT: mov v7.s[3], w9
5596 ; CHECK-GI-NEXT: sxtb w9, w10
5597 ; CHECK-GI-NEXT: ldr w10, [sp, #520]
5598 ; CHECK-GI-NEXT: sxtb w11, w11
5599 ; CHECK-GI-NEXT: mov v17.s[0], w12
5600 ; CHECK-GI-NEXT: ldr w12, [sp, #552]
5601 ; CHECK-GI-NEXT: sxtb w8, w8
5602 ; CHECK-GI-NEXT: mov v19.s[0], w9
5603 ; CHECK-GI-NEXT: ldr w9, [sp, #504]
5604 ; CHECK-GI-NEXT: sxtb w10, w10
5605 ; CHECK-GI-NEXT: mov v16.s[2], w11
5606 ; CHECK-GI-NEXT: ldr w11, [sp, #576]
5607 ; CHECK-GI-NEXT: mov v18.s[0], w8
5608 ; CHECK-GI-NEXT: sxtb w8, w12
5609 ; CHECK-GI-NEXT: ldr w12, [sp, #528]
5610 ; CHECK-GI-NEXT: sxtb w9, w9
5611 ; CHECK-GI-NEXT: mov v17.s[1], w10
5612 ; CHECK-GI-NEXT: sxtb w11, w11
5613 ; CHECK-GI-NEXT: ldr w10, [sp, #560]
5614 ; CHECK-GI-NEXT: mov v19.s[1], w8
5615 ; CHECK-GI-NEXT: ldr w8, [sp, #608]
5616 ; CHECK-GI-NEXT: sxtb w12, w12
5617 ; CHECK-GI-NEXT: mov v16.s[3], w9
5618 ; CHECK-GI-NEXT: ldr w9, [sp, #584]
5619 ; CHECK-GI-NEXT: mov v20.s[0], w11
5620 ; CHECK-GI-NEXT: sxtb w10, w10
5621 ; CHECK-GI-NEXT: sxtb w8, w8
5622 ; CHECK-GI-NEXT: ldr w11, [sp, #536]
5623 ; CHECK-GI-NEXT: mov v17.s[2], w12
5624 ; CHECK-GI-NEXT: sxtb w9, w9
5625 ; CHECK-GI-NEXT: mov v19.s[2], w10
5626 ; CHECK-GI-NEXT: ldr w10, [sp, #616]
5627 ; CHECK-GI-NEXT: mov v21.s[0], w8
5628 ; CHECK-GI-NEXT: ldr w12, [sp, #568]
5629 ; CHECK-GI-NEXT: sxtb w11, w11
5630 ; CHECK-GI-NEXT: mov v22.s[2], wzr
5631 ; CHECK-GI-NEXT: mov v20.s[1], w9
5632 ; CHECK-GI-NEXT: sxtb w9, w10
5633 ; CHECK-GI-NEXT: ldr w10, [sp, #640]
5634 ; CHECK-GI-NEXT: sxtb w12, w12
5635 ; CHECK-GI-NEXT: mov v17.s[3], w11
5636 ; CHECK-GI-NEXT: sxtb w11, w13
5637 ; CHECK-GI-NEXT: mov v21.s[1], w9
5638 ; CHECK-GI-NEXT: ldr w9, [sp, #672]
5639 ; CHECK-GI-NEXT: sxtb w10, w10
5640 ; CHECK-GI-NEXT: mov v19.s[3], w12
5641 ; CHECK-GI-NEXT: ldr w12, [sp, #624]
5642 ; CHECK-GI-NEXT: ldr w8, [sp, #600]
5643 ; CHECK-GI-NEXT: mov v20.s[2], w11
5644 ; CHECK-GI-NEXT: ldr w11, [sp, #704]
5645 ; CHECK-GI-NEXT: sxtb w9, w9
5646 ; CHECK-GI-NEXT: sxtb w12, w12
5647 ; CHECK-GI-NEXT: mov v23.s[0], w10
5648 ; CHECK-GI-NEXT: ldr w10, [sp, #648]
5649 ; CHECK-GI-NEXT: sxtb w11, w11
5650 ; CHECK-GI-NEXT: mov v24.s[0], w9
5651 ; CHECK-GI-NEXT: ldr w9, [sp, #736]
5652 ; CHECK-GI-NEXT: mov v21.s[2], w12
5653 ; CHECK-GI-NEXT: ldr w12, [sp, #680]
5654 ; CHECK-GI-NEXT: sxtb w10, w10
5655 ; CHECK-GI-NEXT: mov v25.s[0], w11
5656 ; CHECK-GI-NEXT: ldr w11, [sp, #712]
5657 ; CHECK-GI-NEXT: sxtb w9, w9
5658 ; CHECK-GI-NEXT: sxtb w12, w12
5659 ; CHECK-GI-NEXT: mov v23.s[1], w10
5660 ; CHECK-GI-NEXT: ldr w10, [sp, #656]
5661 ; CHECK-GI-NEXT: sxtb w11, w11
5662 ; CHECK-GI-NEXT: mov v26.s[0], w9
5663 ; CHECK-GI-NEXT: ldr w9, [sp, #720]
5664 ; CHECK-GI-NEXT: mov v24.s[1], w12
5665 ; CHECK-GI-NEXT: ldr w12, [sp, #688]
5666 ; CHECK-GI-NEXT: sxtb w10, w10
5667 ; CHECK-GI-NEXT: mov v25.s[1], w11
5668 ; CHECK-GI-NEXT: sxtb w9, w9
5669 ; CHECK-GI-NEXT: mov v18.s[1], wzr
5670 ; CHECK-GI-NEXT: sxtb w12, w12
5671 ; CHECK-GI-NEXT: ldr w11, [sp, #632]
5672 ; CHECK-GI-NEXT: mov v23.s[2], w10
5673 ; CHECK-GI-NEXT: mov v26.s[1], wzr
5674 ; CHECK-GI-NEXT: ldr w10, [sp, #664]
5675 ; CHECK-GI-NEXT: sxtb w8, w8
5676 ; CHECK-GI-NEXT: mov v24.s[2], w12
5677 ; CHECK-GI-NEXT: ldr w12, [sp, #696]
5678 ; CHECK-GI-NEXT: mov v22.s[3], wzr
5679 ; CHECK-GI-NEXT: mov v25.s[2], w9
5680 ; CHECK-GI-NEXT: ldr w9, [sp, #728]
5681 ; CHECK-GI-NEXT: mov v20.s[3], w8
5682 ; CHECK-GI-NEXT: sxtb w8, w11
5683 ; CHECK-GI-NEXT: sxtb w10, w10
5684 ; CHECK-GI-NEXT: sxtb w11, w12
5685 ; CHECK-GI-NEXT: sxtb w9, w9
5686 ; CHECK-GI-NEXT: mov v18.s[2], wzr
5687 ; CHECK-GI-NEXT: mov v26.s[2], wzr
5688 ; CHECK-GI-NEXT: mov v21.s[3], w8
5689 ; CHECK-GI-NEXT: mov v23.s[3], w10
5690 ; CHECK-GI-NEXT: mov v24.s[3], w11
5691 ; CHECK-GI-NEXT: mov v25.s[3], w9
5692 ; CHECK-GI-NEXT: add v27.4s, v22.4s, v22.4s
5693 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
5694 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
5695 ; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s
5696 ; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s
5697 ; CHECK-GI-NEXT: mov v18.s[3], wzr
5698 ; CHECK-GI-NEXT: mov v26.s[3], wzr
5699 ; CHECK-GI-NEXT: add v4.4s, v16.4s, v17.4s
5700 ; CHECK-GI-NEXT: add v5.4s, v22.4s, v27.4s
5701 ; CHECK-GI-NEXT: add v6.4s, v19.4s, v20.4s
5702 ; CHECK-GI-NEXT: add v7.4s, v21.4s, v23.4s
5703 ; CHECK-GI-NEXT: add v16.4s, v24.4s, v25.4s
5704 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
5705 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
5706 ; CHECK-GI-NEXT: add v3.4s, v27.4s, v27.4s
5707 ; CHECK-GI-NEXT: add v2.4s, v18.4s, v5.4s
5708 ; CHECK-GI-NEXT: add v4.4s, v4.4s, v6.4s
5709 ; CHECK-GI-NEXT: add v5.4s, v26.4s, v5.4s
5710 ; CHECK-GI-NEXT: add v6.4s, v7.4s, v16.4s
5711 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
5712 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
5713 ; CHECK-GI-NEXT: add v3.4s, v5.4s, v3.4s
5714 ; CHECK-GI-NEXT: add v2.4s, v4.4s, v6.4s
5715 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
5716 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
5717 ; CHECK-GI-NEXT: addv s0, v0.4s
5718 ; CHECK-GI-NEXT: addv s1, v1.4s
5719 ; CHECK-GI-NEXT: fmov w8, s0
5720 ; CHECK-GI-NEXT: fmov w9, s1
5721 ; CHECK-GI-NEXT: add w0, w8, w9
5722 ; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
5723 ; CHECK-GI-NEXT: ret
5725 %az = sext <33 x i8> %a to <33 x i32>
5726 %r1 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %az)
5727 %cz = sext <33 x i8> %c to <33 x i32>
5728 %r2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %cz)
5729 %x = add i32 %r1, %r2
; test_udot_v48i8: unsigned dot-product reduction over 48 bytes.
; The IR loads <48 x i8> from %a and from %b, zero-extends both to i32,
; multiplies them lane-wise, sums all lanes with
; llvm.vector.reduce.add.v48i32 and adds %sum to that total.
; SelectionDAG expectation: three udot instructions chained into a single
; zeroed accumulator (v0), then addv and a scalar add of w2.
; GlobalISel expectation: three independent udot accumulators (v1, v2, v3)
; combined with vector adds; one add operand is a vector (v0) whose four
; lanes are each written from wzr.
5733 define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
5734 ; CHECK-SD-LABEL: test_udot_v48i8:
5735 ; CHECK-SD: // %bb.0: // %entry
5736 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
5737 ; CHECK-SD-NEXT: ldr q1, [x0, #32]
5738 ; CHECK-SD-NEXT: ldr q2, [x1, #32]
5739 ; CHECK-SD-NEXT: udot v0.4s, v2.16b, v1.16b
5740 ; CHECK-SD-NEXT: ldp q3, q1, [x0]
5741 ; CHECK-SD-NEXT: ldp q4, q2, [x1]
5742 ; CHECK-SD-NEXT: udot v0.4s, v4.16b, v3.16b
5743 ; CHECK-SD-NEXT: udot v0.4s, v2.16b, v1.16b
5744 ; CHECK-SD-NEXT: addv s0, v0.4s
5745 ; CHECK-SD-NEXT: fmov w8, s0
5746 ; CHECK-SD-NEXT: add w0, w8, w2
5747 ; CHECK-SD-NEXT: ret
5749 ; CHECK-GI-LABEL: test_udot_v48i8:
5750 ; CHECK-GI: // %bb.0: // %entry
5751 ; CHECK-GI-NEXT: mov v0.s[0], wzr
5752 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
5753 ; CHECK-GI-NEXT: ldr q7, [x0, #32]
5754 ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
5755 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
5756 ; CHECK-GI-NEXT: ldr q17, [x1, #32]
5757 ; CHECK-GI-NEXT: ldp q4, q5, [x0]
5758 ; CHECK-GI-NEXT: ldp q6, q16, [x1]
5759 ; CHECK-GI-NEXT: mov v0.s[1], wzr
5760 ; CHECK-GI-NEXT: udot v2.4s, v17.16b, v7.16b
5761 ; CHECK-GI-NEXT: udot v1.4s, v6.16b, v4.16b
5762 ; CHECK-GI-NEXT: udot v3.4s, v16.16b, v5.16b
5763 ; CHECK-GI-NEXT: mov v0.s[2], wzr
5764 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s
5765 ; CHECK-GI-NEXT: mov v0.s[3], wzr
5766 ; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s
5767 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
5768 ; CHECK-GI-NEXT: addv s0, v0.4s
5769 ; CHECK-GI-NEXT: fmov w8, s0
5770 ; CHECK-GI-NEXT: add w0, w8, w2
5771 ; CHECK-GI-NEXT: ret
; Input IR; the assertions above are autogenerated from it.
5773 %0 = load <48 x i8>, ptr %a
5774 %1 = zext <48 x i8> %0 to <48 x i32>
5775 %2 = load <48 x i8>, ptr %b
5776 %3 = zext <48 x i8> %2 to <48 x i32>
5777 %4 = mul nuw nsw <48 x i32> %3, %1
5778 %5 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %4)
5779 %op.extra = add i32 %5, %sum
; test_udot_v48i8_nomla: unsigned widening sum with no multiply in the IR.
; The IR loads <48 x i8> from %a1, zero-extends it to i32 and reduces it with
; llvm.vector.reduce.add.v48i32.
; Both selectors are expected to use udot with a vector of byte 1s
; (movi ..., #1) as the second multiplicand. SelectionDAG chains the three
; udots into one accumulator; GlobalISel uses three accumulators plus a
; vector (v0) whose lanes are each written from wzr, combined with adds.
5783 define i32 @test_udot_v48i8_nomla(ptr nocapture readonly %a1) {
5784 ; CHECK-SD-LABEL: test_udot_v48i8_nomla:
5785 ; CHECK-SD: // %bb.0: // %entry
5786 ; CHECK-SD-NEXT: movi v0.16b, #1
5787 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
5788 ; CHECK-SD-NEXT: ldr q2, [x0, #32]
5789 ; CHECK-SD-NEXT: udot v1.4s, v2.16b, v0.16b
5790 ; CHECK-SD-NEXT: ldp q3, q2, [x0]
5791 ; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b
5792 ; CHECK-SD-NEXT: udot v1.4s, v2.16b, v0.16b
5793 ; CHECK-SD-NEXT: addv s0, v1.4s
5794 ; CHECK-SD-NEXT: fmov w0, s0
5795 ; CHECK-SD-NEXT: ret
5797 ; CHECK-GI-LABEL: test_udot_v48i8_nomla:
5798 ; CHECK-GI: // %bb.0: // %entry
5799 ; CHECK-GI-NEXT: mov v0.s[0], wzr
5800 ; CHECK-GI-NEXT: movi v1.16b, #1
5801 ; CHECK-GI-NEXT: ldr q7, [x0, #32]
5802 ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
5803 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
5804 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
5805 ; CHECK-GI-NEXT: ldp q5, q6, [x0]
5806 ; CHECK-GI-NEXT: mov v0.s[1], wzr
5807 ; CHECK-GI-NEXT: udot v2.4s, v5.16b, v1.16b
5808 ; CHECK-GI-NEXT: udot v4.4s, v6.16b, v1.16b
5809 ; CHECK-GI-NEXT: udot v3.4s, v7.16b, v1.16b
5810 ; CHECK-GI-NEXT: mov v0.s[2], wzr
5811 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v4.4s
5812 ; CHECK-GI-NEXT: mov v0.s[3], wzr
5813 ; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
5814 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
5815 ; CHECK-GI-NEXT: addv s0, v0.4s
5816 ; CHECK-GI-NEXT: fmov w0, s0
5817 ; CHECK-GI-NEXT: ret
; Input IR; the assertions above are autogenerated from it.
5819 %0 = load <48 x i8>, ptr %a1
5820 %1 = zext <48 x i8> %0 to <48 x i32>
5821 %2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %1)
; test_sdot_v48i8: signed dot-product reduction over 48 bytes.
; The IR loads <48 x i8> from %a and from %b, sign-extends both to i32,
; multiplies them lane-wise, sums all lanes with
; llvm.vector.reduce.add.v48i32 and adds %sum to that total.
; SelectionDAG expectation: three sdot instructions chained into a single
; zeroed accumulator (v0), then addv and a scalar add of w2.
; GlobalISel expectation: three independent sdot accumulators (v1, v2, v3)
; combined with vector adds; one add operand is a vector (v0) whose four
; lanes are each written from wzr.
5824 define i32 @test_sdot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
5825 ; CHECK-SD-LABEL: test_sdot_v48i8:
5826 ; CHECK-SD: // %bb.0: // %entry
5827 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
5828 ; CHECK-SD-NEXT: ldr q1, [x0, #32]
5829 ; CHECK-SD-NEXT: ldr q2, [x1, #32]
5830 ; CHECK-SD-NEXT: sdot v0.4s, v2.16b, v1.16b
5831 ; CHECK-SD-NEXT: ldp q3, q1, [x0]
5832 ; CHECK-SD-NEXT: ldp q4, q2, [x1]
5833 ; CHECK-SD-NEXT: sdot v0.4s, v4.16b, v3.16b
5834 ; CHECK-SD-NEXT: sdot v0.4s, v2.16b, v1.16b
5835 ; CHECK-SD-NEXT: addv s0, v0.4s
5836 ; CHECK-SD-NEXT: fmov w8, s0
5837 ; CHECK-SD-NEXT: add w0, w8, w2
5838 ; CHECK-SD-NEXT: ret
5840 ; CHECK-GI-LABEL: test_sdot_v48i8:
5841 ; CHECK-GI: // %bb.0: // %entry
5842 ; CHECK-GI-NEXT: mov v0.s[0], wzr
5843 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
5844 ; CHECK-GI-NEXT: ldr q7, [x0, #32]
5845 ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
5846 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
5847 ; CHECK-GI-NEXT: ldr q17, [x1, #32]
5848 ; CHECK-GI-NEXT: ldp q4, q5, [x0]
5849 ; CHECK-GI-NEXT: ldp q6, q16, [x1]
5850 ; CHECK-GI-NEXT: mov v0.s[1], wzr
5851 ; CHECK-GI-NEXT: sdot v2.4s, v17.16b, v7.16b
5852 ; CHECK-GI-NEXT: sdot v1.4s, v6.16b, v4.16b
5853 ; CHECK-GI-NEXT: sdot v3.4s, v16.16b, v5.16b
5854 ; CHECK-GI-NEXT: mov v0.s[2], wzr
5855 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s
5856 ; CHECK-GI-NEXT: mov v0.s[3], wzr
5857 ; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s
5858 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
5859 ; CHECK-GI-NEXT: addv s0, v0.4s
5860 ; CHECK-GI-NEXT: fmov w8, s0
5861 ; CHECK-GI-NEXT: add w0, w8, w2
5862 ; CHECK-GI-NEXT: ret
; Input IR; the assertions above are autogenerated from it.
5864 %0 = load <48 x i8>, ptr %a
5865 %1 = sext <48 x i8> %0 to <48 x i32>
5866 %2 = load <48 x i8>, ptr %b
5867 %3 = sext <48 x i8> %2 to <48 x i32>
5868 %4 = mul nsw <48 x i32> %3, %1
5869 %5 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %4)
5870 %op.extra = add nsw i32 %5, %sum
5874 define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 x i8> %d) {
5875 ; CHECK-SD-LABEL: test_sdot_v48i8_double:
5876 ; CHECK-SD: // %bb.0: // %entry
5877 ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
5878 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
5879 ; CHECK-SD-NEXT: .cfi_offset w29, -16
5880 ; CHECK-SD-NEXT: ldr b3, [sp, #592]
5881 ; CHECK-SD-NEXT: add x8, sp, #600
5882 ; CHECK-SD-NEXT: ldr b6, [sp, #208]
5883 ; CHECK-SD-NEXT: ldr b0, [sp, #336]
5884 ; CHECK-SD-NEXT: add x9, sp, #344
5885 ; CHECK-SD-NEXT: ldr b2, [sp, #464]
5886 ; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8]
5887 ; CHECK-SD-NEXT: add x8, sp, #216
5888 ; CHECK-SD-NEXT: add x10, sp, #624
5889 ; CHECK-SD-NEXT: ld1 { v6.b }[1], [x8]
5890 ; CHECK-SD-NEXT: add x8, sp, #608
5891 ; CHECK-SD-NEXT: ld1 { v0.b }[1], [x9]
5892 ; CHECK-SD-NEXT: add x9, sp, #232
5893 ; CHECK-SD-NEXT: fmov s1, w0
5894 ; CHECK-SD-NEXT: ldr b7, [sp, #1360]
5895 ; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8]
5896 ; CHECK-SD-NEXT: add x8, sp, #224
5897 ; CHECK-SD-NEXT: add x11, sp, #648
5898 ; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8]
5899 ; CHECK-SD-NEXT: add x8, sp, #616
5900 ; CHECK-SD-NEXT: add x12, sp, #376
5901 ; CHECK-SD-NEXT: mov v1.b[1], w1
5902 ; CHECK-SD-NEXT: ldr b16, [sp, #976]
5903 ; CHECK-SD-NEXT: add x14, sp, #288
5904 ; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8]
5905 ; CHECK-SD-NEXT: add x8, sp, #632
5906 ; CHECK-SD-NEXT: add x15, sp, #408
5907 ; CHECK-SD-NEXT: ld1 { v6.b }[3], [x9]
5908 ; CHECK-SD-NEXT: add x9, sp, #472
5909 ; CHECK-SD-NEXT: add x13, sp, #696
5910 ; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9]
5911 ; CHECK-SD-NEXT: add x9, sp, #240
5912 ; CHECK-SD-NEXT: add x16, sp, #448
5913 ; CHECK-SD-NEXT: ld1 { v3.b }[4], [x10]
5914 ; CHECK-SD-NEXT: add x10, sp, #352
5915 ; CHECK-SD-NEXT: mov v1.b[2], w2
5916 ; CHECK-SD-NEXT: ld1 { v6.b }[4], [x9]
5917 ; CHECK-SD-NEXT: ld1 { v0.b }[2], [x10]
5918 ; CHECK-SD-NEXT: add x10, sp, #1368
5919 ; CHECK-SD-NEXT: ld1 { v7.b }[1], [x10]
5920 ; CHECK-SD-NEXT: add x10, sp, #248
5921 ; CHECK-SD-NEXT: add x9, sp, #640
5922 ; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8]
5923 ; CHECK-SD-NEXT: add x8, sp, #656
5924 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
5925 ; CHECK-SD-NEXT: ld1 { v6.b }[5], [x10]
5926 ; CHECK-SD-NEXT: add x10, sp, #360
5927 ; CHECK-SD-NEXT: mov v1.b[3], w3
5928 ; CHECK-SD-NEXT: ld1 { v0.b }[3], [x10]
5929 ; CHECK-SD-NEXT: add x10, sp, #256
5930 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
5931 ; CHECK-SD-NEXT: ld1 { v3.b }[6], [x9]
5932 ; CHECK-SD-NEXT: add x9, sp, #368
5933 ; CHECK-SD-NEXT: ldr b17, [sp, #720]
5934 ; CHECK-SD-NEXT: ld1 { v6.b }[6], [x10]
5935 ; CHECK-SD-NEXT: add x10, sp, #984
5936 ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x9]
5937 ; CHECK-SD-NEXT: ld1 { v16.b }[1], [x10]
5938 ; CHECK-SD-NEXT: add x10, sp, #664
5939 ; CHECK-SD-NEXT: ld1 { v3.b }[7], [x11]
5940 ; CHECK-SD-NEXT: add x11, sp, #264
5941 ; CHECK-SD-NEXT: mov v1.b[4], w4
5942 ; CHECK-SD-NEXT: ld1 { v6.b }[7], [x11]
5943 ; CHECK-SD-NEXT: add x9, sp, #672
5944 ; CHECK-SD-NEXT: add x11, sp, #680
5945 ; CHECK-SD-NEXT: ld1 { v0.b }[5], [x12]
5946 ; CHECK-SD-NEXT: add x12, sp, #480
5947 ; CHECK-SD-NEXT: ld1 { v2.b }[2], [x12]
5948 ; CHECK-SD-NEXT: add x12, sp, #272
5949 ; CHECK-SD-NEXT: ld1 { v3.b }[8], [x8]
5950 ; CHECK-SD-NEXT: ld1 { v6.b }[8], [x12]
5951 ; CHECK-SD-NEXT: add x12, sp, #384
5952 ; CHECK-SD-NEXT: mov v1.b[5], w5
5953 ; CHECK-SD-NEXT: ld1 { v0.b }[6], [x12]
5954 ; CHECK-SD-NEXT: add x12, sp, #280
5955 ; CHECK-SD-NEXT: add x8, sp, #688
5956 ; CHECK-SD-NEXT: ld1 { v3.b }[9], [x10]
5957 ; CHECK-SD-NEXT: add x10, sp, #1376
5958 ; CHECK-SD-NEXT: ld1 { v7.b }[2], [x10]
5959 ; CHECK-SD-NEXT: add x10, sp, #392
5960 ; CHECK-SD-NEXT: ld1 { v6.b }[9], [x12]
5961 ; CHECK-SD-NEXT: ld1 { v0.b }[7], [x10]
5962 ; CHECK-SD-NEXT: mov v1.b[6], w6
5963 ; CHECK-SD-NEXT: add x12, sp, #704
5964 ; CHECK-SD-NEXT: ld1 { v3.b }[10], [x9]
5965 ; CHECK-SD-NEXT: add x9, sp, #400
5966 ; CHECK-SD-NEXT: add x10, sp, #712
5967 ; CHECK-SD-NEXT: ld1 { v6.b }[10], [x14]
5968 ; CHECK-SD-NEXT: add x14, sp, #992
5969 ; CHECK-SD-NEXT: ld1 { v0.b }[8], [x9]
5970 ; CHECK-SD-NEXT: ld1 { v16.b }[2], [x14]
5971 ; CHECK-SD-NEXT: add x14, sp, #296
5972 ; CHECK-SD-NEXT: ld1 { v3.b }[11], [x11]
5973 ; CHECK-SD-NEXT: add x9, sp, #304
5974 ; CHECK-SD-NEXT: add x11, sp, #312
5975 ; CHECK-SD-NEXT: ld1 { v6.b }[11], [x14]
5976 ; CHECK-SD-NEXT: mov v1.b[7], w7
5977 ; CHECK-SD-NEXT: add x14, sp, #320
5978 ; CHECK-SD-NEXT: ld1 { v0.b }[9], [x15]
5979 ; CHECK-SD-NEXT: add x15, sp, #328
5980 ; CHECK-SD-NEXT: ld1 { v3.b }[12], [x8]
5981 ; CHECK-SD-NEXT: add x8, sp, #416
5982 ; CHECK-SD-NEXT: ld1 { v6.b }[12], [x9]
5983 ; CHECK-SD-NEXT: add x9, sp, #1384
5984 ; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8]
5985 ; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9]
5986 ; CHECK-SD-NEXT: add x9, sp, #424
5987 ; CHECK-SD-NEXT: ld1 { v3.b }[13], [x13]
5988 ; CHECK-SD-NEXT: add x8, sp, #432
5989 ; CHECK-SD-NEXT: add x13, sp, #440
5990 ; CHECK-SD-NEXT: ld1 { v6.b }[13], [x11]
5991 ; CHECK-SD-NEXT: add x11, sp, #16
5992 ; CHECK-SD-NEXT: ld1 { v0.b }[11], [x9]
5993 ; CHECK-SD-NEXT: add x9, sp, #1000
5994 ; CHECK-SD-NEXT: ld1 { v1.b }[8], [x11]
5995 ; CHECK-SD-NEXT: ld1 { v16.b }[3], [x9]
5996 ; CHECK-SD-NEXT: ld1 { v3.b }[14], [x12]
5997 ; CHECK-SD-NEXT: add x12, sp, #488
5998 ; CHECK-SD-NEXT: ld1 { v6.b }[14], [x14]
5999 ; CHECK-SD-NEXT: add x14, sp, #1392
6000 ; CHECK-SD-NEXT: ld1 { v2.b }[3], [x12]
6001 ; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14]
6002 ; CHECK-SD-NEXT: add x11, sp, #1008
6003 ; CHECK-SD-NEXT: ld1 { v0.b }[12], [x8]
6004 ; CHECK-SD-NEXT: ld1 { v16.b }[4], [x11]
6005 ; CHECK-SD-NEXT: add x8, sp, #1400
6006 ; CHECK-SD-NEXT: ld1 { v3.b }[15], [x10]
6007 ; CHECK-SD-NEXT: add x10, sp, #496
6008 ; CHECK-SD-NEXT: add x9, sp, #24
6009 ; CHECK-SD-NEXT: ld1 { v6.b }[15], [x15]
6010 ; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8]
6011 ; CHECK-SD-NEXT: ld1 { v2.b }[4], [x10]
6012 ; CHECK-SD-NEXT: add x10, sp, #1016
6013 ; CHECK-SD-NEXT: ld1 { v16.b }[5], [x10]
6014 ; CHECK-SD-NEXT: ld1 { v0.b }[13], [x13]
6015 ; CHECK-SD-NEXT: add x8, sp, #1408
6016 ; CHECK-SD-NEXT: ld1 { v1.b }[9], [x9]
6017 ; CHECK-SD-NEXT: add x9, sp, #504
6018 ; CHECK-SD-NEXT: add x10, sp, #512
6019 ; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8]
6020 ; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9]
6021 ; CHECK-SD-NEXT: add x9, sp, #1024
6022 ; CHECK-SD-NEXT: add x8, sp, #32
6023 ; CHECK-SD-NEXT: ld1 { v16.b }[6], [x9]
6024 ; CHECK-SD-NEXT: ld1 { v0.b }[14], [x16]
6025 ; CHECK-SD-NEXT: ld1 { v1.b }[10], [x8]
6026 ; CHECK-SD-NEXT: add x8, sp, #1416
6027 ; CHECK-SD-NEXT: add x9, sp, #456
6028 ; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8]
6029 ; CHECK-SD-NEXT: ld1 { v2.b }[6], [x10]
6030 ; CHECK-SD-NEXT: add x10, sp, #1032
6031 ; CHECK-SD-NEXT: add x8, sp, #40
6032 ; CHECK-SD-NEXT: ld1 { v16.b }[7], [x10]
6033 ; CHECK-SD-NEXT: ld1 { v0.b }[15], [x9]
6034 ; CHECK-SD-NEXT: ld1 { v1.b }[11], [x8]
6035 ; CHECK-SD-NEXT: add x8, sp, #1424
6036 ; CHECK-SD-NEXT: add x9, sp, #520
6037 ; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8]
6038 ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x9]
6039 ; CHECK-SD-NEXT: add x9, sp, #1040
6040 ; CHECK-SD-NEXT: add x8, sp, #48
6041 ; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9]
6042 ; CHECK-SD-NEXT: add x10, sp, #528
6043 ; CHECK-SD-NEXT: ld1 { v1.b }[12], [x8]
6044 ; CHECK-SD-NEXT: add x8, sp, #1432
6045 ; CHECK-SD-NEXT: sdot v5.4s, v6.16b, v3.16b
6046 ; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8]
6047 ; CHECK-SD-NEXT: ld1 { v2.b }[8], [x10]
6048 ; CHECK-SD-NEXT: add x8, sp, #1048
6049 ; CHECK-SD-NEXT: ldr b3, [sp, #80]
6050 ; CHECK-SD-NEXT: ld1 { v16.b }[9], [x8]
6051 ; CHECK-SD-NEXT: add x10, sp, #88
6052 ; CHECK-SD-NEXT: add x8, sp, #536
6053 ; CHECK-SD-NEXT: add x11, sp, #1440
6054 ; CHECK-SD-NEXT: add x9, sp, #56
6055 ; CHECK-SD-NEXT: ld1 { v3.b }[1], [x10]
6056 ; CHECK-SD-NEXT: ld1 { v2.b }[9], [x8]
6057 ; CHECK-SD-NEXT: add x8, sp, #1056
6058 ; CHECK-SD-NEXT: ld1 { v7.b }[10], [x11]
6059 ; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8]
6060 ; CHECK-SD-NEXT: ld1 { v1.b }[13], [x9]
6061 ; CHECK-SD-NEXT: add x9, sp, #96
6062 ; CHECK-SD-NEXT: add x8, sp, #544
6063 ; CHECK-SD-NEXT: add x10, sp, #1448
6064 ; CHECK-SD-NEXT: ld1 { v3.b }[2], [x9]
6065 ; CHECK-SD-NEXT: ld1 { v2.b }[10], [x8]
6066 ; CHECK-SD-NEXT: add x8, sp, #1064
6067 ; CHECK-SD-NEXT: ld1 { v7.b }[11], [x10]
6068 ; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8]
6069 ; CHECK-SD-NEXT: add x10, sp, #104
6070 ; CHECK-SD-NEXT: add x8, sp, #552
6071 ; CHECK-SD-NEXT: add x11, sp, #1456
6072 ; CHECK-SD-NEXT: add x9, sp, #64
6073 ; CHECK-SD-NEXT: ld1 { v3.b }[3], [x10]
6074 ; CHECK-SD-NEXT: ld1 { v2.b }[11], [x8]
6075 ; CHECK-SD-NEXT: add x8, sp, #1072
6076 ; CHECK-SD-NEXT: ld1 { v7.b }[12], [x11]
6077 ; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8]
6078 ; CHECK-SD-NEXT: ld1 { v1.b }[14], [x9]
6079 ; CHECK-SD-NEXT: add x9, sp, #112
6080 ; CHECK-SD-NEXT: add x8, sp, #560
6081 ; CHECK-SD-NEXT: add x10, sp, #1464
6082 ; CHECK-SD-NEXT: ld1 { v3.b }[4], [x9]
6083 ; CHECK-SD-NEXT: ld1 { v2.b }[12], [x8]
6084 ; CHECK-SD-NEXT: add x8, sp, #1080
6085 ; CHECK-SD-NEXT: ld1 { v7.b }[13], [x10]
6086 ; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8]
6087 ; CHECK-SD-NEXT: add x10, sp, #120
6088 ; CHECK-SD-NEXT: add x8, sp, #568
6089 ; CHECK-SD-NEXT: add x11, sp, #1472
6090 ; CHECK-SD-NEXT: add x9, sp, #72
6091 ; CHECK-SD-NEXT: ld1 { v3.b }[5], [x10]
6092 ; CHECK-SD-NEXT: ld1 { v2.b }[13], [x8]
6093 ; CHECK-SD-NEXT: add x8, sp, #1088
6094 ; CHECK-SD-NEXT: ld1 { v7.b }[14], [x11]
6095 ; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8]
6096 ; CHECK-SD-NEXT: ld1 { v1.b }[15], [x9]
6097 ; CHECK-SD-NEXT: add x9, sp, #128
6098 ; CHECK-SD-NEXT: ldr b6, [sp, #1104]
6099 ; CHECK-SD-NEXT: add x10, sp, #1480
6100 ; CHECK-SD-NEXT: ld1 { v3.b }[6], [x9]
6101 ; CHECK-SD-NEXT: add x8, sp, #1096
6102 ; CHECK-SD-NEXT: add x9, sp, #1112
6103 ; CHECK-SD-NEXT: ld1 { v7.b }[15], [x10]
6104 ; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8]
6105 ; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9]
6106 ; CHECK-SD-NEXT: add x8, sp, #728
6107 ; CHECK-SD-NEXT: add x9, sp, #576
6108 ; CHECK-SD-NEXT: add x10, sp, #136
6109 ; CHECK-SD-NEXT: ld1 { v17.b }[1], [x8]
6110 ; CHECK-SD-NEXT: add x8, sp, #1120
6111 ; CHECK-SD-NEXT: ld1 { v2.b }[14], [x9]
6112 ; CHECK-SD-NEXT: sdot v4.4s, v16.16b, v7.16b
6113 ; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8]
6114 ; CHECK-SD-NEXT: add x8, sp, #736
6115 ; CHECK-SD-NEXT: ldr b7, [sp, #1232]
6116 ; CHECK-SD-NEXT: ldr b16, [sp, #848]
6117 ; CHECK-SD-NEXT: ld1 { v3.b }[7], [x10]
6118 ; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8]
6119 ; CHECK-SD-NEXT: add x9, sp, #1240
6120 ; CHECK-SD-NEXT: add x10, sp, #856
6121 ; CHECK-SD-NEXT: ld1 { v7.b }[1], [x9]
6122 ; CHECK-SD-NEXT: ld1 { v16.b }[1], [x10]
6123 ; CHECK-SD-NEXT: add x8, sp, #1128
6124 ; CHECK-SD-NEXT: add x11, sp, #744
6125 ; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8]
6126 ; CHECK-SD-NEXT: add x10, sp, #1248
6127 ; CHECK-SD-NEXT: ld1 { v17.b }[3], [x11]
6128 ; CHECK-SD-NEXT: add x11, sp, #864
6129 ; CHECK-SD-NEXT: add x9, sp, #144
6130 ; CHECK-SD-NEXT: ld1 { v7.b }[2], [x10]
6131 ; CHECK-SD-NEXT: ld1 { v16.b }[2], [x11]
6132 ; CHECK-SD-NEXT: add x8, sp, #1136
6133 ; CHECK-SD-NEXT: add x12, sp, #752
6134 ; CHECK-SD-NEXT: ld1 { v3.b }[8], [x9]
6135 ; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8]
6136 ; CHECK-SD-NEXT: ld1 { v17.b }[4], [x12]
6137 ; CHECK-SD-NEXT: add x9, sp, #1256
6138 ; CHECK-SD-NEXT: add x10, sp, #872
6139 ; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9]
6140 ; CHECK-SD-NEXT: ld1 { v16.b }[3], [x10]
6141 ; CHECK-SD-NEXT: add x8, sp, #1144
6142 ; CHECK-SD-NEXT: add x11, sp, #760
6143 ; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8]
6144 ; CHECK-SD-NEXT: add x10, sp, #1264
6145 ; CHECK-SD-NEXT: ld1 { v17.b }[5], [x11]
6146 ; CHECK-SD-NEXT: add x11, sp, #880
6147 ; CHECK-SD-NEXT: add x9, sp, #152
6148 ; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10]
6149 ; CHECK-SD-NEXT: ld1 { v16.b }[4], [x11]
6150 ; CHECK-SD-NEXT: add x8, sp, #1152
6151 ; CHECK-SD-NEXT: add x12, sp, #768
6152 ; CHECK-SD-NEXT: ld1 { v3.b }[9], [x9]
6153 ; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8]
6154 ; CHECK-SD-NEXT: ld1 { v17.b }[6], [x12]
6155 ; CHECK-SD-NEXT: add x9, sp, #1272
6156 ; CHECK-SD-NEXT: add x10, sp, #888
6157 ; CHECK-SD-NEXT: ld1 { v7.b }[5], [x9]
6158 ; CHECK-SD-NEXT: ld1 { v16.b }[5], [x10]
6159 ; CHECK-SD-NEXT: add x8, sp, #1160
6160 ; CHECK-SD-NEXT: add x11, sp, #776
6161 ; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8]
6162 ; CHECK-SD-NEXT: add x10, sp, #1280
6163 ; CHECK-SD-NEXT: ld1 { v17.b }[7], [x11]
6164 ; CHECK-SD-NEXT: add x11, sp, #896
6165 ; CHECK-SD-NEXT: add x9, sp, #160
6166 ; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10]
6167 ; CHECK-SD-NEXT: ld1 { v16.b }[6], [x11]
6168 ; CHECK-SD-NEXT: add x8, sp, #1168
6169 ; CHECK-SD-NEXT: add x12, sp, #784
6170 ; CHECK-SD-NEXT: ld1 { v3.b }[10], [x9]
6171 ; CHECK-SD-NEXT: ld1 { v6.b }[8], [x8]
6172 ; CHECK-SD-NEXT: ld1 { v17.b }[8], [x12]
6173 ; CHECK-SD-NEXT: add x9, sp, #1288
6174 ; CHECK-SD-NEXT: add x10, sp, #904
6175 ; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9]
6176 ; CHECK-SD-NEXT: ld1 { v16.b }[7], [x10]
6177 ; CHECK-SD-NEXT: add x8, sp, #1176
6178 ; CHECK-SD-NEXT: add x11, sp, #792
6179 ; CHECK-SD-NEXT: ld1 { v6.b }[9], [x8]
6180 ; CHECK-SD-NEXT: add x10, sp, #1296
6181 ; CHECK-SD-NEXT: ld1 { v17.b }[9], [x11]
6182 ; CHECK-SD-NEXT: add x11, sp, #912
6183 ; CHECK-SD-NEXT: add x9, sp, #168
6184 ; CHECK-SD-NEXT: ld1 { v7.b }[8], [x10]
6185 ; CHECK-SD-NEXT: ld1 { v16.b }[8], [x11]
6186 ; CHECK-SD-NEXT: add x8, sp, #1184
6187 ; CHECK-SD-NEXT: add x12, sp, #800
6188 ; CHECK-SD-NEXT: ld1 { v3.b }[11], [x9]
6189 ; CHECK-SD-NEXT: ld1 { v6.b }[10], [x8]
6190 ; CHECK-SD-NEXT: ld1 { v17.b }[10], [x12]
6191 ; CHECK-SD-NEXT: add x9, sp, #1304
6192 ; CHECK-SD-NEXT: add x10, sp, #920
6193 ; CHECK-SD-NEXT: ld1 { v7.b }[9], [x9]
6194 ; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10]
6195 ; CHECK-SD-NEXT: add x8, sp, #1192
6196 ; CHECK-SD-NEXT: add x11, sp, #808
6197 ; CHECK-SD-NEXT: ld1 { v6.b }[11], [x8]
6198 ; CHECK-SD-NEXT: add x10, sp, #1312
6199 ; CHECK-SD-NEXT: ld1 { v17.b }[11], [x11]
6200 ; CHECK-SD-NEXT: add x11, sp, #928
6201 ; CHECK-SD-NEXT: add x9, sp, #176
6202 ; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10]
6203 ; CHECK-SD-NEXT: ld1 { v16.b }[10], [x11]
6204 ; CHECK-SD-NEXT: add x8, sp, #1200
6205 ; CHECK-SD-NEXT: add x12, sp, #816
6206 ; CHECK-SD-NEXT: ld1 { v3.b }[12], [x9]
6207 ; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8]
6208 ; CHECK-SD-NEXT: ld1 { v17.b }[12], [x12]
6209 ; CHECK-SD-NEXT: add x9, sp, #1320
6210 ; CHECK-SD-NEXT: add x10, sp, #936
6211 ; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9]
6212 ; CHECK-SD-NEXT: ld1 { v16.b }[11], [x10]
6213 ; CHECK-SD-NEXT: add x8, sp, #1208
6214 ; CHECK-SD-NEXT: add x11, sp, #824
6215 ; CHECK-SD-NEXT: ld1 { v6.b }[13], [x8]
6216 ; CHECK-SD-NEXT: add x10, sp, #1328
6217 ; CHECK-SD-NEXT: ld1 { v17.b }[13], [x11]
6218 ; CHECK-SD-NEXT: add x11, sp, #944
6219 ; CHECK-SD-NEXT: add x9, sp, #184
6220 ; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10]
6221 ; CHECK-SD-NEXT: ld1 { v16.b }[12], [x11]
6222 ; CHECK-SD-NEXT: add x8, sp, #1216
6223 ; CHECK-SD-NEXT: add x12, sp, #832
6224 ; CHECK-SD-NEXT: ld1 { v3.b }[13], [x9]
6225 ; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8]
6226 ; CHECK-SD-NEXT: ld1 { v17.b }[14], [x12]
6227 ; CHECK-SD-NEXT: add x9, sp, #1336
6228 ; CHECK-SD-NEXT: add x10, sp, #952
6229 ; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9]
6230 ; CHECK-SD-NEXT: ld1 { v16.b }[13], [x10]
6231 ; CHECK-SD-NEXT: add x8, sp, #1224
6232 ; CHECK-SD-NEXT: add x11, sp, #840
6233 ; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8]
6234 ; CHECK-SD-NEXT: add x8, sp, #192
6235 ; CHECK-SD-NEXT: ld1 { v17.b }[15], [x11]
6236 ; CHECK-SD-NEXT: add x10, sp, #1344
6237 ; CHECK-SD-NEXT: add x11, sp, #960
6238 ; CHECK-SD-NEXT: ld1 { v3.b }[14], [x8]
6239 ; CHECK-SD-NEXT: ld1 { v7.b }[14], [x10]
6240 ; CHECK-SD-NEXT: ld1 { v16.b }[14], [x11]
6241 ; CHECK-SD-NEXT: add x9, sp, #584
6242 ; CHECK-SD-NEXT: sdot v5.4s, v1.16b, v0.16b
6243 ; CHECK-SD-NEXT: add x8, sp, #200
6244 ; CHECK-SD-NEXT: sdot v4.4s, v17.16b, v6.16b
6245 ; CHECK-SD-NEXT: ld1 { v2.b }[15], [x9]
6246 ; CHECK-SD-NEXT: add x9, sp, #1352
6247 ; CHECK-SD-NEXT: add x10, sp, #968
6248 ; CHECK-SD-NEXT: ld1 { v3.b }[15], [x8]
6249 ; CHECK-SD-NEXT: ld1 { v7.b }[15], [x9]
6250 ; CHECK-SD-NEXT: ld1 { v16.b }[15], [x10]
6251 ; CHECK-SD-NEXT: sdot v5.4s, v3.16b, v2.16b
6252 ; CHECK-SD-NEXT: sdot v4.4s, v16.16b, v7.16b
6253 ; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
6254 ; CHECK-SD-NEXT: addv s0, v0.4s
6255 ; CHECK-SD-NEXT: fmov w0, s0
6256 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
6257 ; CHECK-SD-NEXT: ret
6259 ; CHECK-GI-LABEL: test_sdot_v48i8_double:
6260 ; CHECK-GI: // %bb.0: // %entry
6261 ; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
6262 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
6263 ; CHECK-GI-NEXT: .cfi_offset w29, -16
6264 ; CHECK-GI-NEXT: ldr w11, [sp, #80]
6265 ; CHECK-GI-NEXT: ldr w8, [sp, #208]
6266 ; CHECK-GI-NEXT: fmov s0, w0
6267 ; CHECK-GI-NEXT: ldr w12, [sp, #336]
6268 ; CHECK-GI-NEXT: ldr w9, [sp, #88]
6269 ; CHECK-GI-NEXT: mov v20.s[0], wzr
6270 ; CHECK-GI-NEXT: fmov s1, w11
6271 ; CHECK-GI-NEXT: fmov s2, w8
6272 ; CHECK-GI-NEXT: ldr w11, [sp, #464]
6273 ; CHECK-GI-NEXT: ldr w8, [sp, #592]
6274 ; CHECK-GI-NEXT: ldr w10, [sp, #216]
6275 ; CHECK-GI-NEXT: fmov s3, w12
6276 ; CHECK-GI-NEXT: fmov s4, w11
6277 ; CHECK-GI-NEXT: mov v0.b[1], w1
6278 ; CHECK-GI-NEXT: ldr w11, [sp, #600]
6279 ; CHECK-GI-NEXT: fmov s5, w8
6280 ; CHECK-GI-NEXT: mov v1.b[1], w9
6281 ; CHECK-GI-NEXT: mov v2.b[1], w10
6282 ; CHECK-GI-NEXT: ldr w10, [sp, #344]
6283 ; CHECK-GI-NEXT: ldr w9, [sp, #472]
6284 ; CHECK-GI-NEXT: ldr w8, [sp, #96]
6285 ; CHECK-GI-NEXT: ldr w12, [sp, #848]
6286 ; CHECK-GI-NEXT: ldr w13, [sp, #728]
6287 ; CHECK-GI-NEXT: mov v20.s[1], wzr
6288 ; CHECK-GI-NEXT: mov v3.b[1], w10
6289 ; CHECK-GI-NEXT: mov v4.b[1], w9
6290 ; CHECK-GI-NEXT: mov v5.b[1], w11
6291 ; CHECK-GI-NEXT: ldr w9, [sp, #224]
6292 ; CHECK-GI-NEXT: mov v0.b[2], w2
6293 ; CHECK-GI-NEXT: ldr w10, [sp, #352]
6294 ; CHECK-GI-NEXT: ldr w11, [sp, #480]
6295 ; CHECK-GI-NEXT: mov v1.b[2], w8
6296 ; CHECK-GI-NEXT: ldr w8, [sp, #608]
6297 ; CHECK-GI-NEXT: mov v2.b[2], w9
6298 ; CHECK-GI-NEXT: ldr w9, [sp, #104]
6299 ; CHECK-GI-NEXT: fmov s7, w12
6300 ; CHECK-GI-NEXT: mov v3.b[2], w10
6301 ; CHECK-GI-NEXT: mov v4.b[2], w11
6302 ; CHECK-GI-NEXT: mov v5.b[2], w8
6303 ; CHECK-GI-NEXT: ldr w10, [sp, #232]
6304 ; CHECK-GI-NEXT: mov v0.b[3], w3
6305 ; CHECK-GI-NEXT: ldr w8, [sp, #360]
6306 ; CHECK-GI-NEXT: ldr w11, [sp, #488]
6307 ; CHECK-GI-NEXT: mov v1.b[3], w9
6308 ; CHECK-GI-NEXT: ldr w9, [sp, #616]
6309 ; CHECK-GI-NEXT: mov v2.b[3], w10
6310 ; CHECK-GI-NEXT: ldr w10, [sp, #240]
6311 ; CHECK-GI-NEXT: ldr w12, [sp, #1232]
6312 ; CHECK-GI-NEXT: mov v3.b[3], w8
6313 ; CHECK-GI-NEXT: mov v4.b[3], w11
6314 ; CHECK-GI-NEXT: ldr w8, [sp, #112]
6315 ; CHECK-GI-NEXT: mov v5.b[3], w9
6316 ; CHECK-GI-NEXT: mov v0.b[4], w4
6317 ; CHECK-GI-NEXT: ldr w9, [sp, #368]
6318 ; CHECK-GI-NEXT: ldr w11, [sp, #496]
6319 ; CHECK-GI-NEXT: mov v1.b[4], w8
6320 ; CHECK-GI-NEXT: ldr w8, [sp, #624]
6321 ; CHECK-GI-NEXT: mov v2.b[4], w10
6322 ; CHECK-GI-NEXT: ldr w10, [sp, #248]
6323 ; CHECK-GI-NEXT: fmov s18, w12
6324 ; CHECK-GI-NEXT: mov v3.b[4], w9
6325 ; CHECK-GI-NEXT: mov v4.b[4], w11
6326 ; CHECK-GI-NEXT: ldr w9, [sp, #120]
6327 ; CHECK-GI-NEXT: mov v5.b[4], w8
6328 ; CHECK-GI-NEXT: mov v0.b[5], w5
6329 ; CHECK-GI-NEXT: ldr w8, [sp, #376]
6330 ; CHECK-GI-NEXT: ldr w11, [sp, #504]
6331 ; CHECK-GI-NEXT: mov v1.b[5], w9
6332 ; CHECK-GI-NEXT: ldr w9, [sp, #632]
6333 ; CHECK-GI-NEXT: mov v2.b[5], w10
6334 ; CHECK-GI-NEXT: ldr w10, [sp, #256]
6335 ; CHECK-GI-NEXT: ldr w12, [sp, #744]
6336 ; CHECK-GI-NEXT: mov v3.b[5], w8
6337 ; CHECK-GI-NEXT: mov v4.b[5], w11
6338 ; CHECK-GI-NEXT: ldr w8, [sp, #128]
6339 ; CHECK-GI-NEXT: mov v5.b[5], w9
6340 ; CHECK-GI-NEXT: mov v0.b[6], w6
6341 ; CHECK-GI-NEXT: ldr w9, [sp, #384]
6342 ; CHECK-GI-NEXT: ldr w11, [sp, #512]
6343 ; CHECK-GI-NEXT: mov v1.b[6], w8
6344 ; CHECK-GI-NEXT: ldr w8, [sp, #640]
6345 ; CHECK-GI-NEXT: mov v2.b[6], w10
6346 ; CHECK-GI-NEXT: ldr w10, [sp, #264]
6347 ; CHECK-GI-NEXT: movi v21.2d, #0000000000000000
6348 ; CHECK-GI-NEXT: mov v3.b[6], w9
6349 ; CHECK-GI-NEXT: mov v4.b[6], w11
6350 ; CHECK-GI-NEXT: ldr w9, [sp, #136]
6351 ; CHECK-GI-NEXT: mov v5.b[6], w8
6352 ; CHECK-GI-NEXT: mov v0.b[7], w7
6353 ; CHECK-GI-NEXT: ldr w8, [sp, #392]
6354 ; CHECK-GI-NEXT: ldr w11, [sp, #520]
6355 ; CHECK-GI-NEXT: mov v1.b[7], w9
6356 ; CHECK-GI-NEXT: ldr w9, [sp, #648]
6357 ; CHECK-GI-NEXT: mov v2.b[7], w10
6358 ; CHECK-GI-NEXT: ldr w10, [sp, #16]
6359 ; CHECK-GI-NEXT: movi v22.2d, #0000000000000000
6360 ; CHECK-GI-NEXT: mov v3.b[7], w8
6361 ; CHECK-GI-NEXT: mov v4.b[7], w11
6362 ; CHECK-GI-NEXT: ldr w8, [sp, #144]
6363 ; CHECK-GI-NEXT: mov v5.b[7], w9
6364 ; CHECK-GI-NEXT: ldr w11, [sp, #272]
6365 ; CHECK-GI-NEXT: mov v0.b[8], w10
6366 ; CHECK-GI-NEXT: ldr w9, [sp, #400]
6367 ; CHECK-GI-NEXT: ldr w10, [sp, #528]
6368 ; CHECK-GI-NEXT: mov v1.b[8], w8
6369 ; CHECK-GI-NEXT: ldr w8, [sp, #656]
6370 ; CHECK-GI-NEXT: mov v2.b[8], w11
6371 ; CHECK-GI-NEXT: ldr w11, [sp, #24]
6372 ; CHECK-GI-NEXT: mov v3.b[8], w9
6373 ; CHECK-GI-NEXT: mov v4.b[8], w10
6374 ; CHECK-GI-NEXT: ldr w9, [sp, #152]
6375 ; CHECK-GI-NEXT: mov v5.b[8], w8
6376 ; CHECK-GI-NEXT: ldr w10, [sp, #280]
6377 ; CHECK-GI-NEXT: mov v0.b[9], w11
6378 ; CHECK-GI-NEXT: ldr w8, [sp, #408]
6379 ; CHECK-GI-NEXT: ldr w11, [sp, #536]
6380 ; CHECK-GI-NEXT: mov v1.b[9], w9
6381 ; CHECK-GI-NEXT: ldr w9, [sp, #664]
6382 ; CHECK-GI-NEXT: mov v2.b[9], w10
6383 ; CHECK-GI-NEXT: ldr w10, [sp, #32]
6384 ; CHECK-GI-NEXT: mov v3.b[9], w8
6385 ; CHECK-GI-NEXT: mov v4.b[9], w11
6386 ; CHECK-GI-NEXT: ldr w8, [sp, #160]
6387 ; CHECK-GI-NEXT: mov v5.b[9], w9
6388 ; CHECK-GI-NEXT: ldr w11, [sp, #288]
6389 ; CHECK-GI-NEXT: mov v0.b[10], w10
6390 ; CHECK-GI-NEXT: ldr w9, [sp, #416]
6391 ; CHECK-GI-NEXT: ldr w10, [sp, #544]
6392 ; CHECK-GI-NEXT: mov v1.b[10], w8
6393 ; CHECK-GI-NEXT: ldr w8, [sp, #672]
6394 ; CHECK-GI-NEXT: mov v2.b[10], w11
6395 ; CHECK-GI-NEXT: ldr w11, [sp, #40]
6396 ; CHECK-GI-NEXT: mov v3.b[10], w9
6397 ; CHECK-GI-NEXT: mov v4.b[10], w10
6398 ; CHECK-GI-NEXT: ldr w9, [sp, #168]
6399 ; CHECK-GI-NEXT: mov v5.b[10], w8
6400 ; CHECK-GI-NEXT: ldr w10, [sp, #296]
6401 ; CHECK-GI-NEXT: mov v0.b[11], w11
6402 ; CHECK-GI-NEXT: ldr w8, [sp, #424]
6403 ; CHECK-GI-NEXT: ldr w11, [sp, #552]
6404 ; CHECK-GI-NEXT: mov v1.b[11], w9
6405 ; CHECK-GI-NEXT: ldr w9, [sp, #680]
6406 ; CHECK-GI-NEXT: mov v2.b[11], w10
6407 ; CHECK-GI-NEXT: ldr w10, [sp, #48]
6408 ; CHECK-GI-NEXT: mov v3.b[11], w8
6409 ; CHECK-GI-NEXT: mov v4.b[11], w11
6410 ; CHECK-GI-NEXT: ldr w8, [sp, #176]
6411 ; CHECK-GI-NEXT: mov v5.b[11], w9
6412 ; CHECK-GI-NEXT: ldr w11, [sp, #304]
6413 ; CHECK-GI-NEXT: mov v0.b[12], w10
6414 ; CHECK-GI-NEXT: ldr w9, [sp, #432]
6415 ; CHECK-GI-NEXT: ldr w10, [sp, #560]
6416 ; CHECK-GI-NEXT: mov v1.b[12], w8
6417 ; CHECK-GI-NEXT: ldr w8, [sp, #688]
6418 ; CHECK-GI-NEXT: mov v2.b[12], w11
6419 ; CHECK-GI-NEXT: ldr w11, [sp, #56]
6420 ; CHECK-GI-NEXT: mov v3.b[12], w9
6421 ; CHECK-GI-NEXT: mov v4.b[12], w10
6422 ; CHECK-GI-NEXT: ldr w9, [sp, #184]
6423 ; CHECK-GI-NEXT: mov v5.b[12], w8
6424 ; CHECK-GI-NEXT: ldr w10, [sp, #312]
6425 ; CHECK-GI-NEXT: mov v0.b[13], w11
6426 ; CHECK-GI-NEXT: ldr w8, [sp, #440]
6427 ; CHECK-GI-NEXT: ldr w11, [sp, #568]
6428 ; CHECK-GI-NEXT: mov v1.b[13], w9
6429 ; CHECK-GI-NEXT: ldr w9, [sp, #696]
6430 ; CHECK-GI-NEXT: mov v2.b[13], w10
6431 ; CHECK-GI-NEXT: ldr w10, [sp, #64]
6432 ; CHECK-GI-NEXT: mov v3.b[13], w8
6433 ; CHECK-GI-NEXT: mov v4.b[13], w11
6434 ; CHECK-GI-NEXT: ldr w8, [sp, #192]
6435 ; CHECK-GI-NEXT: mov v5.b[13], w9
6436 ; CHECK-GI-NEXT: ldr w11, [sp, #320]
6437 ; CHECK-GI-NEXT: mov v0.b[14], w10
6438 ; CHECK-GI-NEXT: ldr w9, [sp, #448]
6439 ; CHECK-GI-NEXT: ldr w10, [sp, #576]
6440 ; CHECK-GI-NEXT: mov v1.b[14], w8
6441 ; CHECK-GI-NEXT: ldr w8, [sp, #704]
6442 ; CHECK-GI-NEXT: mov v2.b[14], w11
6443 ; CHECK-GI-NEXT: ldr w11, [sp, #72]
6444 ; CHECK-GI-NEXT: mov v3.b[14], w9
6445 ; CHECK-GI-NEXT: mov v4.b[14], w10
6446 ; CHECK-GI-NEXT: ldr w9, [sp, #200]
6447 ; CHECK-GI-NEXT: mov v5.b[14], w8
6448 ; CHECK-GI-NEXT: ldr w10, [sp, #328]
6449 ; CHECK-GI-NEXT: mov v0.b[15], w11
6450 ; CHECK-GI-NEXT: ldr w11, [sp, #584]
6451 ; CHECK-GI-NEXT: mov v1.b[15], w9
6452 ; CHECK-GI-NEXT: ldr w9, [sp, #712]
6453 ; CHECK-GI-NEXT: mov v2.b[15], w10
6454 ; CHECK-GI-NEXT: ldr w10, [sp, #720]
6455 ; CHECK-GI-NEXT: ldr w8, [sp, #456]
6456 ; CHECK-GI-NEXT: mov v4.b[15], w11
6457 ; CHECK-GI-NEXT: ldr w11, [sp, #976]
6458 ; CHECK-GI-NEXT: movi v23.2d, #0000000000000000
6459 ; CHECK-GI-NEXT: mov v5.b[15], w9
6460 ; CHECK-GI-NEXT: ldr w9, [sp, #856]
6461 ; CHECK-GI-NEXT: fmov s6, w10
6462 ; CHECK-GI-NEXT: fmov s16, w11
6463 ; CHECK-GI-NEXT: ldr w11, [sp, #1104]
6464 ; CHECK-GI-NEXT: ldr w10, [sp, #984]
6465 ; CHECK-GI-NEXT: mov v7.b[1], w9
6466 ; CHECK-GI-NEXT: ldr w9, [sp, #1360]
6467 ; CHECK-GI-NEXT: mov v3.b[15], w8
6468 ; CHECK-GI-NEXT: fmov s17, w11
6469 ; CHECK-GI-NEXT: mov v6.b[1], w13
6470 ; CHECK-GI-NEXT: ldr w13, [sp, #1112]
6471 ; CHECK-GI-NEXT: fmov s19, w9
6472 ; CHECK-GI-NEXT: mov v16.b[1], w10
6473 ; CHECK-GI-NEXT: ldr w10, [sp, #1240]
6474 ; CHECK-GI-NEXT: ldr w11, [sp, #1368]
6475 ; CHECK-GI-NEXT: ldr w8, [sp, #736]
6476 ; CHECK-GI-NEXT: ldr w9, [sp, #864]
6477 ; CHECK-GI-NEXT: mov v17.b[1], w13
6478 ; CHECK-GI-NEXT: mov v18.b[1], w10
6479 ; CHECK-GI-NEXT: ldr w10, [sp, #992]
6480 ; CHECK-GI-NEXT: mov v19.b[1], w11
6481 ; CHECK-GI-NEXT: mov v6.b[2], w8
6482 ; CHECK-GI-NEXT: ldr w8, [sp, #1120]
6483 ; CHECK-GI-NEXT: mov v7.b[2], w9
6484 ; CHECK-GI-NEXT: ldr w9, [sp, #1248]
6485 ; CHECK-GI-NEXT: ldr w11, [sp, #1376]
6486 ; CHECK-GI-NEXT: mov v16.b[2], w10
6487 ; CHECK-GI-NEXT: ldr w10, [sp, #752]
6488 ; CHECK-GI-NEXT: mov v20.s[2], wzr
6489 ; CHECK-GI-NEXT: mov v17.b[2], w8
6490 ; CHECK-GI-NEXT: ldr w8, [sp, #872]
6491 ; CHECK-GI-NEXT: mov v18.b[2], w9
6492 ; CHECK-GI-NEXT: mov v19.b[2], w11
6493 ; CHECK-GI-NEXT: ldr w9, [sp, #1000]
6494 ; CHECK-GI-NEXT: mov v6.b[3], w12
6495 ; CHECK-GI-NEXT: ldr w11, [sp, #1128]
6496 ; CHECK-GI-NEXT: mov v7.b[3], w8
6497 ; CHECK-GI-NEXT: ldr w8, [sp, #1256]
6498 ; CHECK-GI-NEXT: ldr w12, [sp, #1384]
6499 ; CHECK-GI-NEXT: mov v16.b[3], w9
6500 ; CHECK-GI-NEXT: ldr w9, [sp, #760]
6501 ; CHECK-GI-NEXT: mov v17.b[3], w11
6502 ; CHECK-GI-NEXT: ldr w11, [sp, #880]
6503 ; CHECK-GI-NEXT: mov v18.b[3], w8
6504 ; CHECK-GI-NEXT: mov v19.b[3], w12
6505 ; CHECK-GI-NEXT: ldr w8, [sp, #1008]
6506 ; CHECK-GI-NEXT: mov v6.b[4], w10
6507 ; CHECK-GI-NEXT: ldr w10, [sp, #1136]
6508 ; CHECK-GI-NEXT: mov v7.b[4], w11
6509 ; CHECK-GI-NEXT: ldr w11, [sp, #1264]
6510 ; CHECK-GI-NEXT: ldr w12, [sp, #1392]
6511 ; CHECK-GI-NEXT: mov v16.b[4], w8
6512 ; CHECK-GI-NEXT: ldr w8, [sp, #768]
6513 ; CHECK-GI-NEXT: mov v17.b[4], w10
6514 ; CHECK-GI-NEXT: ldr w10, [sp, #888]
6515 ; CHECK-GI-NEXT: mov v18.b[4], w11
6516 ; CHECK-GI-NEXT: mov v19.b[4], w12
6517 ; CHECK-GI-NEXT: ldr w11, [sp, #1016]
6518 ; CHECK-GI-NEXT: mov v6.b[5], w9
6519 ; CHECK-GI-NEXT: ldr w9, [sp, #1144]
6520 ; CHECK-GI-NEXT: mov v7.b[5], w10
6521 ; CHECK-GI-NEXT: ldr w10, [sp, #1272]
6522 ; CHECK-GI-NEXT: ldr w12, [sp, #1400]
6523 ; CHECK-GI-NEXT: mov v16.b[5], w11
6524 ; CHECK-GI-NEXT: ldr w11, [sp, #776]
6525 ; CHECK-GI-NEXT: mov v17.b[5], w9
6526 ; CHECK-GI-NEXT: ldr w9, [sp, #896]
6527 ; CHECK-GI-NEXT: mov v18.b[5], w10
6528 ; CHECK-GI-NEXT: mov v19.b[5], w12
6529 ; CHECK-GI-NEXT: ldr w10, [sp, #1024]
6530 ; CHECK-GI-NEXT: mov v6.b[6], w8
6531 ; CHECK-GI-NEXT: ldr w8, [sp, #1152]
6532 ; CHECK-GI-NEXT: mov v7.b[6], w9
6533 ; CHECK-GI-NEXT: ldr w9, [sp, #1280]
6534 ; CHECK-GI-NEXT: ldr w12, [sp, #1408]
6535 ; CHECK-GI-NEXT: mov v16.b[6], w10
6536 ; CHECK-GI-NEXT: ldr w10, [sp, #784]
6537 ; CHECK-GI-NEXT: mov v17.b[6], w8
6538 ; CHECK-GI-NEXT: ldr w8, [sp, #904]
6539 ; CHECK-GI-NEXT: mov v18.b[6], w9
6540 ; CHECK-GI-NEXT: mov v19.b[6], w12
6541 ; CHECK-GI-NEXT: ldr w9, [sp, #1032]
6542 ; CHECK-GI-NEXT: mov v6.b[7], w11
6543 ; CHECK-GI-NEXT: ldr w11, [sp, #1160]
6544 ; CHECK-GI-NEXT: mov v7.b[7], w8
6545 ; CHECK-GI-NEXT: ldr w8, [sp, #1288]
6546 ; CHECK-GI-NEXT: ldr w12, [sp, #1416]
6547 ; CHECK-GI-NEXT: mov v16.b[7], w9
6548 ; CHECK-GI-NEXT: ldr w9, [sp, #792]
6549 ; CHECK-GI-NEXT: mov v17.b[7], w11
6550 ; CHECK-GI-NEXT: ldr w11, [sp, #912]
6551 ; CHECK-GI-NEXT: mov v18.b[7], w8
6552 ; CHECK-GI-NEXT: mov v19.b[7], w12
6553 ; CHECK-GI-NEXT: ldr w8, [sp, #1040]
6554 ; CHECK-GI-NEXT: mov v6.b[8], w10
6555 ; CHECK-GI-NEXT: ldr w10, [sp, #1168]
6556 ; CHECK-GI-NEXT: mov v7.b[8], w11
6557 ; CHECK-GI-NEXT: ldr w11, [sp, #1296]
6558 ; CHECK-GI-NEXT: ldr w12, [sp, #1424]
6559 ; CHECK-GI-NEXT: mov v16.b[8], w8
6560 ; CHECK-GI-NEXT: ldr w8, [sp, #800]
6561 ; CHECK-GI-NEXT: mov v17.b[8], w10
6562 ; CHECK-GI-NEXT: ldr w10, [sp, #920]
6563 ; CHECK-GI-NEXT: mov v18.b[8], w11
6564 ; CHECK-GI-NEXT: mov v19.b[8], w12
6565 ; CHECK-GI-NEXT: ldr w11, [sp, #1048]
6566 ; CHECK-GI-NEXT: mov v6.b[9], w9
6567 ; CHECK-GI-NEXT: ldr w9, [sp, #1176]
6568 ; CHECK-GI-NEXT: mov v7.b[9], w10
6569 ; CHECK-GI-NEXT: ldr w10, [sp, #1304]
6570 ; CHECK-GI-NEXT: ldr w12, [sp, #1432]
6571 ; CHECK-GI-NEXT: mov v16.b[9], w11
6572 ; CHECK-GI-NEXT: ldr w11, [sp, #808]
6573 ; CHECK-GI-NEXT: mov v17.b[9], w9
6574 ; CHECK-GI-NEXT: ldr w9, [sp, #928]
6575 ; CHECK-GI-NEXT: mov v18.b[9], w10
6576 ; CHECK-GI-NEXT: mov v19.b[9], w12
6577 ; CHECK-GI-NEXT: ldr w10, [sp, #1056]
6578 ; CHECK-GI-NEXT: mov v6.b[10], w8
6579 ; CHECK-GI-NEXT: ldr w8, [sp, #1184]
6580 ; CHECK-GI-NEXT: mov v7.b[10], w9
6581 ; CHECK-GI-NEXT: ldr w9, [sp, #1312]
6582 ; CHECK-GI-NEXT: ldr w12, [sp, #1440]
6583 ; CHECK-GI-NEXT: mov v16.b[10], w10
6584 ; CHECK-GI-NEXT: ldr w10, [sp, #816]
6585 ; CHECK-GI-NEXT: mov v17.b[10], w8
6586 ; CHECK-GI-NEXT: ldr w8, [sp, #936]
6587 ; CHECK-GI-NEXT: mov v18.b[10], w9
6588 ; CHECK-GI-NEXT: mov v19.b[10], w12
6589 ; CHECK-GI-NEXT: ldr w9, [sp, #1064]
6590 ; CHECK-GI-NEXT: mov v6.b[11], w11
6591 ; CHECK-GI-NEXT: ldr w11, [sp, #1192]
6592 ; CHECK-GI-NEXT: mov v7.b[11], w8
6593 ; CHECK-GI-NEXT: ldr w8, [sp, #1320]
6594 ; CHECK-GI-NEXT: ldr w12, [sp, #1448]
6595 ; CHECK-GI-NEXT: mov v16.b[11], w9
6596 ; CHECK-GI-NEXT: ldr w9, [sp, #824]
6597 ; CHECK-GI-NEXT: mov v17.b[11], w11
6598 ; CHECK-GI-NEXT: ldr w11, [sp, #944]
6599 ; CHECK-GI-NEXT: mov v18.b[11], w8
6600 ; CHECK-GI-NEXT: mov v19.b[11], w12
6601 ; CHECK-GI-NEXT: ldr w8, [sp, #1072]
6602 ; CHECK-GI-NEXT: mov v6.b[12], w10
6603 ; CHECK-GI-NEXT: ldr w10, [sp, #1200]
6604 ; CHECK-GI-NEXT: mov v7.b[12], w11
6605 ; CHECK-GI-NEXT: ldr w11, [sp, #1328]
6606 ; CHECK-GI-NEXT: ldr w12, [sp, #1456]
6607 ; CHECK-GI-NEXT: mov v16.b[12], w8
6608 ; CHECK-GI-NEXT: ldr w8, [sp, #832]
6609 ; CHECK-GI-NEXT: mov v17.b[12], w10
6610 ; CHECK-GI-NEXT: ldr w10, [sp, #952]
6611 ; CHECK-GI-NEXT: mov v18.b[12], w11
6612 ; CHECK-GI-NEXT: mov v19.b[12], w12
6613 ; CHECK-GI-NEXT: ldr w11, [sp, #1080]
6614 ; CHECK-GI-NEXT: mov v6.b[13], w9
6615 ; CHECK-GI-NEXT: ldr w9, [sp, #1208]
6616 ; CHECK-GI-NEXT: mov v7.b[13], w10
6617 ; CHECK-GI-NEXT: ldr w10, [sp, #1336]
6618 ; CHECK-GI-NEXT: ldr w12, [sp, #1464]
6619 ; CHECK-GI-NEXT: mov v16.b[13], w11
6620 ; CHECK-GI-NEXT: ldr w11, [sp, #960]
6621 ; CHECK-GI-NEXT: mov v17.b[13], w9
6622 ; CHECK-GI-NEXT: mov v18.b[13], w10
6623 ; CHECK-GI-NEXT: ldr w9, [sp, #1088]
6624 ; CHECK-GI-NEXT: mov v19.b[13], w12
6625 ; CHECK-GI-NEXT: mov v6.b[14], w8
6626 ; CHECK-GI-NEXT: ldr w8, [sp, #1216]
6627 ; CHECK-GI-NEXT: mov v7.b[14], w11
6628 ; CHECK-GI-NEXT: ldr w10, [sp, #1344]
6629 ; CHECK-GI-NEXT: ldr w11, [sp, #1472]
6630 ; CHECK-GI-NEXT: ldr w12, [sp, #840]
6631 ; CHECK-GI-NEXT: mov v16.b[14], w9
6632 ; CHECK-GI-NEXT: ldr w9, [sp, #1096]
6633 ; CHECK-GI-NEXT: mov v17.b[14], w8
6634 ; CHECK-GI-NEXT: mov v18.b[14], w10
6635 ; CHECK-GI-NEXT: ldr w8, [sp, #968]
6636 ; CHECK-GI-NEXT: mov v19.b[14], w11
6637 ; CHECK-GI-NEXT: ldr w10, [sp, #1224]
6638 ; CHECK-GI-NEXT: mov v6.b[15], w12
6639 ; CHECK-GI-NEXT: ldr w11, [sp, #1352]
6640 ; CHECK-GI-NEXT: ldr w12, [sp, #1480]
6641 ; CHECK-GI-NEXT: mov v7.b[15], w8
6642 ; CHECK-GI-NEXT: mov v16.b[15], w9
6643 ; CHECK-GI-NEXT: movi v24.2d, #0000000000000000
6644 ; CHECK-GI-NEXT: movi v25.2d, #0000000000000000
6645 ; CHECK-GI-NEXT: mov v17.b[15], w10
6646 ; CHECK-GI-NEXT: mov v18.b[15], w11
6647 ; CHECK-GI-NEXT: movi v26.2d, #0000000000000000
6648 ; CHECK-GI-NEXT: mov v19.b[15], w12
6649 ; CHECK-GI-NEXT: sdot v21.4s, v0.16b, v3.16b
6650 ; CHECK-GI-NEXT: sdot v22.4s, v1.16b, v4.16b
6651 ; CHECK-GI-NEXT: sdot v23.4s, v2.16b, v5.16b
6652 ; CHECK-GI-NEXT: mov v20.s[3], wzr
6653 ; CHECK-GI-NEXT: sdot v25.4s, v6.16b, v17.16b
6654 ; CHECK-GI-NEXT: sdot v26.4s, v7.16b, v18.16b
6655 ; CHECK-GI-NEXT: sdot v24.4s, v16.16b, v19.16b
6656 ; CHECK-GI-NEXT: add v0.4s, v21.4s, v22.4s
6657 ; CHECK-GI-NEXT: add v1.4s, v23.4s, v20.4s
6658 ; CHECK-GI-NEXT: add v2.4s, v25.4s, v26.4s
6659 ; CHECK-GI-NEXT: add v3.4s, v24.4s, v20.4s
6660 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
6661 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
6662 ; CHECK-GI-NEXT: addv s0, v0.4s
6663 ; CHECK-GI-NEXT: addv s1, v1.4s
6664 ; CHECK-GI-NEXT: fmov w8, s0
6665 ; CHECK-GI-NEXT: fmov w9, s1
6666 ; CHECK-GI-NEXT: add w0, w8, w9
6667 ; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
6668 ; CHECK-GI-NEXT: ret
6670 %az = sext <48 x i8> %a to <48 x i32>
6671 %bz = sext <48 x i8> %b to <48 x i32>
6672 %m1 = mul nuw nsw <48 x i32> %az, %bz
6673 %r1 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %m1)
6674 %cz = sext <48 x i8> %c to <48 x i32>
6675 %dz = sext <48 x i8> %d to <48 x i32>
6676 %m2 = mul nuw nsw <48 x i32> %cz, %dz
6677 %r2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %m2)
6678 %x = add i32 %r1, %r2
; test_sdot_v48i8_double_nomla: sign-extends the <48 x i8> arguments %a and %c
; to <48 x i32>, reduces each with llvm.vector.reduce.add.v48i32, and adds the
; two scalar results into %x. There is no multiply, and %b and %d are not
; referenced by the IR below.
; Both check sets expect one sdot per 16-byte piece (six in total) against an
; all-ones byte vector (movi ..., #1). The checks for the default llc run chain
; them into two accumulators followed by a single addv; the checks for the
; -global-isel run use six separate accumulators, a vector add tree that also
; folds in an explicitly zeroed register (v6), two addv reductions and a final
; scalar add.
; Most of the remaining expected instructions assemble the 16-byte operands one
; lane at a time from w0-w7 and from stack slots.
6682 define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 x i8> %d) {
6683 ; CHECK-SD-LABEL: test_sdot_v48i8_double_nomla:
6684 ; CHECK-SD: // %bb.0: // %entry
6685 ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
6686 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
6687 ; CHECK-SD-NEXT: .cfi_offset w29, -16
6688 ; CHECK-SD-NEXT: ldr b5, [sp, #208]
6689 ; CHECK-SD-NEXT: add x8, sp, #216
6690 ; CHECK-SD-NEXT: fmov s0, w0
6691 ; CHECK-SD-NEXT: ldr b4, [sp, #976]
6692 ; CHECK-SD-NEXT: add x9, sp, #984
6693 ; CHECK-SD-NEXT: add x12, sp, #328
6694 ; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8]
6695 ; CHECK-SD-NEXT: add x8, sp, #224
6696 ; CHECK-SD-NEXT: movi v1.16b, #1
6697 ; CHECK-SD-NEXT: mov v0.b[1], w1
6698 ; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9]
6699 ; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
6700 ; CHECK-SD-NEXT: add x11, sp, #992
6701 ; CHECK-SD-NEXT: ldr b6, [sp, #720]
6702 ; CHECK-SD-NEXT: ldr b7, [sp, #80]
6703 ; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8]
6704 ; CHECK-SD-NEXT: add x8, sp, #232
6705 ; CHECK-SD-NEXT: add x13, sp, #88
6706 ; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11]
6707 ; CHECK-SD-NEXT: ld1 { v7.b }[1], [x13]
6708 ; CHECK-SD-NEXT: add x13, sp, #856
6709 ; CHECK-SD-NEXT: mov v0.b[2], w2
6710 ; CHECK-SD-NEXT: add x14, sp, #1008
6711 ; CHECK-SD-NEXT: add x15, sp, #872
6712 ; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8]
6713 ; CHECK-SD-NEXT: add x8, sp, #240
6714 ; CHECK-SD-NEXT: add x16, sp, #888
6715 ; CHECK-SD-NEXT: add x10, sp, #16
6716 ; CHECK-SD-NEXT: add x9, sp, #24
6717 ; CHECK-SD-NEXT: add x11, sp, #40
6718 ; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
6719 ; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8]
6720 ; CHECK-SD-NEXT: add x8, sp, #248
6721 ; CHECK-SD-NEXT: mov v0.b[3], w3
6722 ; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8]
6723 ; CHECK-SD-NEXT: add x8, sp, #256
6724 ; CHECK-SD-NEXT: mov v0.b[4], w4
6725 ; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8]
6726 ; CHECK-SD-NEXT: add x8, sp, #264
6727 ; CHECK-SD-NEXT: mov v0.b[5], w5
6728 ; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8]
6729 ; CHECK-SD-NEXT: add x8, sp, #272
6730 ; CHECK-SD-NEXT: ld1 { v5.b }[8], [x8]
6731 ; CHECK-SD-NEXT: add x8, sp, #280
6732 ; CHECK-SD-NEXT: mov v0.b[6], w6
6733 ; CHECK-SD-NEXT: ld1 { v5.b }[9], [x8]
6734 ; CHECK-SD-NEXT: add x8, sp, #288
6735 ; CHECK-SD-NEXT: mov v0.b[7], w7
6736 ; CHECK-SD-NEXT: ld1 { v5.b }[10], [x8]
6737 ; CHECK-SD-NEXT: add x8, sp, #296
6738 ; CHECK-SD-NEXT: ld1 { v0.b }[8], [x10]
6739 ; CHECK-SD-NEXT: add x10, sp, #128
6740 ; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8]
6741 ; CHECK-SD-NEXT: add x8, sp, #304
6742 ; CHECK-SD-NEXT: ld1 { v0.b }[9], [x9]
6743 ; CHECK-SD-NEXT: add x9, sp, #136
6744 ; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8]
6745 ; CHECK-SD-NEXT: add x8, sp, #312
6746 ; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8]
6747 ; CHECK-SD-NEXT: add x8, sp, #320
6748 ; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8]
6749 ; CHECK-SD-NEXT: add x8, sp, #32
6750 ; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8]
6751 ; CHECK-SD-NEXT: add x8, sp, #144
6752 ; CHECK-SD-NEXT: ld1 { v5.b }[15], [x12]
6753 ; CHECK-SD-NEXT: add x12, sp, #728
6754 ; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12]
6755 ; CHECK-SD-NEXT: add x12, sp, #1000
6756 ; CHECK-SD-NEXT: ld1 { v0.b }[11], [x11]
6757 ; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12]
6758 ; CHECK-SD-NEXT: add x12, sp, #736
6759 ; CHECK-SD-NEXT: add x11, sp, #920
6760 ; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b
6761 ; CHECK-SD-NEXT: ldr b5, [sp, #848]
6762 ; CHECK-SD-NEXT: ld1 { v6.b }[2], [x12]
6763 ; CHECK-SD-NEXT: add x12, sp, #48
6764 ; CHECK-SD-NEXT: ld1 { v5.b }[1], [x13]
6765 ; CHECK-SD-NEXT: add x13, sp, #744
6766 ; CHECK-SD-NEXT: ld1 { v4.b }[4], [x14]
6767 ; CHECK-SD-NEXT: add x14, sp, #96
6768 ; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12]
6769 ; CHECK-SD-NEXT: ld1 { v6.b }[3], [x13]
6770 ; CHECK-SD-NEXT: add x13, sp, #864
6771 ; CHECK-SD-NEXT: ld1 { v7.b }[2], [x14]
6772 ; CHECK-SD-NEXT: add x14, sp, #1016
6773 ; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13]
6774 ; CHECK-SD-NEXT: add x13, sp, #752
6775 ; CHECK-SD-NEXT: ld1 { v4.b }[5], [x14]
6776 ; CHECK-SD-NEXT: add x14, sp, #104
6777 ; CHECK-SD-NEXT: ld1 { v6.b }[4], [x13]
6778 ; CHECK-SD-NEXT: add x13, sp, #1024
6779 ; CHECK-SD-NEXT: ld1 { v7.b }[3], [x14]
6780 ; CHECK-SD-NEXT: ld1 { v5.b }[3], [x15]
6781 ; CHECK-SD-NEXT: add x15, sp, #760
6782 ; CHECK-SD-NEXT: add x14, sp, #112
6783 ; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13]
6784 ; CHECK-SD-NEXT: add x13, sp, #880
6785 ; CHECK-SD-NEXT: ld1 { v6.b }[5], [x15]
6786 ; CHECK-SD-NEXT: add x15, sp, #1032
6787 ; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14]
6788 ; CHECK-SD-NEXT: ld1 { v5.b }[4], [x13]
6789 ; CHECK-SD-NEXT: add x14, sp, #768
6790 ; CHECK-SD-NEXT: add x13, sp, #120
6791 ; CHECK-SD-NEXT: ld1 { v4.b }[7], [x15]
6792 ; CHECK-SD-NEXT: add x15, sp, #1040
6793 ; CHECK-SD-NEXT: ld1 { v6.b }[6], [x14]
6794 ; CHECK-SD-NEXT: ld1 { v7.b }[5], [x13]
6795 ; CHECK-SD-NEXT: add x13, sp, #776
6796 ; CHECK-SD-NEXT: ld1 { v5.b }[5], [x16]
6797 ; CHECK-SD-NEXT: add x14, sp, #1048
6798 ; CHECK-SD-NEXT: ld1 { v4.b }[8], [x15]
6799 ; CHECK-SD-NEXT: add x15, sp, #896
6800 ; CHECK-SD-NEXT: ld1 { v6.b }[7], [x13]
6801 ; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10]
6802 ; CHECK-SD-NEXT: add x10, sp, #784
6803 ; CHECK-SD-NEXT: ld1 { v5.b }[6], [x15]
6804 ; CHECK-SD-NEXT: add x13, sp, #1056
6805 ; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14]
6806 ; CHECK-SD-NEXT: add x14, sp, #904
6807 ; CHECK-SD-NEXT: ld1 { v6.b }[8], [x10]
6808 ; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9]
6809 ; CHECK-SD-NEXT: add x9, sp, #792
6810 ; CHECK-SD-NEXT: ld1 { v5.b }[7], [x14]
6811 ; CHECK-SD-NEXT: add x10, sp, #1064
6812 ; CHECK-SD-NEXT: ld1 { v4.b }[10], [x13]
6813 ; CHECK-SD-NEXT: add x13, sp, #912
6814 ; CHECK-SD-NEXT: ld1 { v6.b }[9], [x9]
6815 ; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8]
6816 ; CHECK-SD-NEXT: add x9, sp, #800
6817 ; CHECK-SD-NEXT: ld1 { v5.b }[8], [x13]
6818 ; CHECK-SD-NEXT: add x8, sp, #152
6819 ; CHECK-SD-NEXT: ld1 { v4.b }[11], [x10]
6820 ; CHECK-SD-NEXT: add x10, sp, #1072
6821 ; CHECK-SD-NEXT: ld1 { v6.b }[10], [x9]
6822 ; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8]
6823 ; CHECK-SD-NEXT: add x9, sp, #808
6824 ; CHECK-SD-NEXT: ld1 { v5.b }[9], [x11]
6825 ; CHECK-SD-NEXT: add x8, sp, #56
6826 ; CHECK-SD-NEXT: ld1 { v4.b }[12], [x10]
6827 ; CHECK-SD-NEXT: add x10, sp, #160
6828 ; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8]
6829 ; CHECK-SD-NEXT: ld1 { v6.b }[11], [x9]
6830 ; CHECK-SD-NEXT: add x9, sp, #928
6831 ; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10]
6832 ; CHECK-SD-NEXT: add x10, sp, #1080
6833 ; CHECK-SD-NEXT: ld1 { v5.b }[10], [x9]
6834 ; CHECK-SD-NEXT: add x8, sp, #816
6835 ; CHECK-SD-NEXT: ld1 { v4.b }[13], [x10]
6836 ; CHECK-SD-NEXT: add x9, sp, #168
6837 ; CHECK-SD-NEXT: add x10, sp, #176
6838 ; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8]
6839 ; CHECK-SD-NEXT: add x8, sp, #936
6840 ; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9]
6841 ; CHECK-SD-NEXT: add x9, sp, #1088
6842 ; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8]
6843 ; CHECK-SD-NEXT: add x8, sp, #64
6844 ; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9]
6845 ; CHECK-SD-NEXT: add x9, sp, #824
6846 ; CHECK-SD-NEXT: ld1 { v0.b }[14], [x8]
6847 ; CHECK-SD-NEXT: ld1 { v6.b }[13], [x9]
6848 ; CHECK-SD-NEXT: add x9, sp, #944
6849 ; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10]
6850 ; CHECK-SD-NEXT: add x10, sp, #1096
6851 ; CHECK-SD-NEXT: ld1 { v5.b }[12], [x9]
6852 ; CHECK-SD-NEXT: add x8, sp, #832
6853 ; CHECK-SD-NEXT: ld1 { v4.b }[15], [x10]
6854 ; CHECK-SD-NEXT: add x9, sp, #184
6855 ; CHECK-SD-NEXT: add x10, sp, #72
6856 ; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8]
6857 ; CHECK-SD-NEXT: add x8, sp, #952
6858 ; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9]
6859 ; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8]
6860 ; CHECK-SD-NEXT: add x8, sp, #840
6861 ; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10]
6862 ; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b
6863 ; CHECK-SD-NEXT: add x9, sp, #192
6864 ; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8]
6865 ; CHECK-SD-NEXT: add x8, sp, #960
6866 ; CHECK-SD-NEXT: ld1 { v7.b }[14], [x9]
6867 ; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8]
6868 ; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b
6869 ; CHECK-SD-NEXT: add x8, sp, #200
6870 ; CHECK-SD-NEXT: add x9, sp, #968
6871 ; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b
6872 ; CHECK-SD-NEXT: ld1 { v7.b }[15], [x8]
6873 ; CHECK-SD-NEXT: ld1 { v5.b }[15], [x9]
6874 ; CHECK-SD-NEXT: sdot v3.4s, v7.16b, v1.16b
6875 ; CHECK-SD-NEXT: sdot v2.4s, v5.16b, v1.16b
6876 ; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s
6877 ; CHECK-SD-NEXT: addv s0, v0.4s
6878 ; CHECK-SD-NEXT: fmov w0, s0
6879 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
6880 ; CHECK-SD-NEXT: ret
6882 ; CHECK-GI-LABEL: test_sdot_v48i8_double_nomla:
6883 ; CHECK-GI: // %bb.0: // %entry
6884 ; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
6885 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
6886 ; CHECK-GI-NEXT: .cfi_offset w29, -16
6887 ; CHECK-GI-NEXT: ldr w10, [sp, #80]
6888 ; CHECK-GI-NEXT: ldr w9, [sp, #88]
6889 ; CHECK-GI-NEXT: fmov s0, w0
6890 ; CHECK-GI-NEXT: ldr w11, [sp, #208]
6891 ; CHECK-GI-NEXT: ldr w8, [sp, #216]
6892 ; CHECK-GI-NEXT: mov v6.s[0], wzr
6893 ; CHECK-GI-NEXT: fmov s1, w10
6894 ; CHECK-GI-NEXT: ldr w10, [sp, #720]
6895 ; CHECK-GI-NEXT: movi v7.16b, #1
6896 ; CHECK-GI-NEXT: fmov s2, w11
6897 ; CHECK-GI-NEXT: mov v0.b[1], w1
6898 ; CHECK-GI-NEXT: ldr w11, [sp, #728]
6899 ; CHECK-GI-NEXT: fmov s3, w10
6900 ; CHECK-GI-NEXT: ldr w10, [sp, #976]
6901 ; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
6902 ; CHECK-GI-NEXT: mov v1.b[1], w9
6903 ; CHECK-GI-NEXT: ldr w9, [sp, #848]
6904 ; CHECK-GI-NEXT: mov v6.s[1], wzr
6905 ; CHECK-GI-NEXT: fmov s5, w10
6906 ; CHECK-GI-NEXT: mov v2.b[1], w8
6907 ; CHECK-GI-NEXT: ldr w8, [sp, #856]
6908 ; CHECK-GI-NEXT: fmov s4, w9
6909 ; CHECK-GI-NEXT: ldr w9, [sp, #984]
6910 ; CHECK-GI-NEXT: mov v3.b[1], w11
6911 ; CHECK-GI-NEXT: ldr w10, [sp, #224]
6912 ; CHECK-GI-NEXT: mov v0.b[2], w2
6913 ; CHECK-GI-NEXT: ldr w11, [sp, #16]
6914 ; CHECK-GI-NEXT: mov v5.b[1], w9
6915 ; CHECK-GI-NEXT: ldr w9, [sp, #864]
6916 ; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
6917 ; CHECK-GI-NEXT: mov v4.b[1], w8
6918 ; CHECK-GI-NEXT: ldr w8, [sp, #96]
6919 ; CHECK-GI-NEXT: mov v2.b[2], w10
6920 ; CHECK-GI-NEXT: ldr w10, [sp, #992]
6921 ; CHECK-GI-NEXT: movi v18.2d, #0000000000000000
6922 ; CHECK-GI-NEXT: movi v19.2d, #0000000000000000
6923 ; CHECK-GI-NEXT: mov v1.b[2], w8
6924 ; CHECK-GI-NEXT: ldr w8, [sp, #736]
6925 ; CHECK-GI-NEXT: mov v0.b[3], w3
6926 ; CHECK-GI-NEXT: mov v5.b[2], w10
6927 ; CHECK-GI-NEXT: ldr w10, [sp, #872]
6928 ; CHECK-GI-NEXT: movi v20.2d, #0000000000000000
6929 ; CHECK-GI-NEXT: mov v3.b[2], w8
6930 ; CHECK-GI-NEXT: mov v4.b[2], w9
6931 ; CHECK-GI-NEXT: ldr w8, [sp, #104]
6932 ; CHECK-GI-NEXT: ldr w9, [sp, #232]
6933 ; CHECK-GI-NEXT: mov v6.s[2], wzr
6934 ; CHECK-GI-NEXT: movi v21.2d, #0000000000000000
6935 ; CHECK-GI-NEXT: mov v1.b[3], w8
6936 ; CHECK-GI-NEXT: ldr w8, [sp, #744]
6937 ; CHECK-GI-NEXT: mov v0.b[4], w4
6938 ; CHECK-GI-NEXT: mov v2.b[3], w9
6939 ; CHECK-GI-NEXT: ldr w9, [sp, #1000]
6940 ; CHECK-GI-NEXT: mov v3.b[3], w8
6941 ; CHECK-GI-NEXT: mov v4.b[3], w10
6942 ; CHECK-GI-NEXT: ldr w8, [sp, #112]
6943 ; CHECK-GI-NEXT: ldr w10, [sp, #240]
6944 ; CHECK-GI-NEXT: mov v5.b[3], w9
6945 ; CHECK-GI-NEXT: ldr w9, [sp, #880]
6946 ; CHECK-GI-NEXT: mov v1.b[4], w8
6947 ; CHECK-GI-NEXT: ldr w8, [sp, #752]
6948 ; CHECK-GI-NEXT: mov v0.b[5], w5
6949 ; CHECK-GI-NEXT: mov v2.b[4], w10
6950 ; CHECK-GI-NEXT: ldr w10, [sp, #1008]
6951 ; CHECK-GI-NEXT: mov v6.s[3], wzr
6952 ; CHECK-GI-NEXT: mov v3.b[4], w8
6953 ; CHECK-GI-NEXT: mov v4.b[4], w9
6954 ; CHECK-GI-NEXT: ldr w8, [sp, #120]
6955 ; CHECK-GI-NEXT: ldr w9, [sp, #248]
6956 ; CHECK-GI-NEXT: mov v5.b[4], w10
6957 ; CHECK-GI-NEXT: ldr w10, [sp, #888]
6958 ; CHECK-GI-NEXT: mov v1.b[5], w8
6959 ; CHECK-GI-NEXT: ldr w8, [sp, #760]
6960 ; CHECK-GI-NEXT: mov v0.b[6], w6
6961 ; CHECK-GI-NEXT: mov v2.b[5], w9
6962 ; CHECK-GI-NEXT: ldr w9, [sp, #1016]
6963 ; CHECK-GI-NEXT: mov v3.b[5], w8
6964 ; CHECK-GI-NEXT: mov v4.b[5], w10
6965 ; CHECK-GI-NEXT: ldr w8, [sp, #128]
6966 ; CHECK-GI-NEXT: ldr w10, [sp, #256]
6967 ; CHECK-GI-NEXT: mov v5.b[5], w9
6968 ; CHECK-GI-NEXT: ldr w9, [sp, #896]
6969 ; CHECK-GI-NEXT: mov v1.b[6], w8
6970 ; CHECK-GI-NEXT: ldr w8, [sp, #768]
6971 ; CHECK-GI-NEXT: mov v0.b[7], w7
6972 ; CHECK-GI-NEXT: mov v2.b[6], w10
6973 ; CHECK-GI-NEXT: ldr w10, [sp, #1024]
6974 ; CHECK-GI-NEXT: mov v3.b[6], w8
6975 ; CHECK-GI-NEXT: ldr w8, [sp, #136]
6976 ; CHECK-GI-NEXT: mov v4.b[6], w9
6977 ; CHECK-GI-NEXT: ldr w9, [sp, #264]
6978 ; CHECK-GI-NEXT: mov v5.b[6], w10
6979 ; CHECK-GI-NEXT: ldr w10, [sp, #904]
6980 ; CHECK-GI-NEXT: mov v1.b[7], w8
6981 ; CHECK-GI-NEXT: ldr w8, [sp, #776]
6982 ; CHECK-GI-NEXT: mov v0.b[8], w11
6983 ; CHECK-GI-NEXT: mov v2.b[7], w9
6984 ; CHECK-GI-NEXT: ldr w9, [sp, #1032]
6985 ; CHECK-GI-NEXT: ldr w11, [sp, #24]
6986 ; CHECK-GI-NEXT: mov v3.b[7], w8
6987 ; CHECK-GI-NEXT: ldr w8, [sp, #144]
6988 ; CHECK-GI-NEXT: mov v4.b[7], w10
6989 ; CHECK-GI-NEXT: ldr w10, [sp, #272]
6990 ; CHECK-GI-NEXT: mov v5.b[7], w9
6991 ; CHECK-GI-NEXT: ldr w9, [sp, #912]
6992 ; CHECK-GI-NEXT: mov v1.b[8], w8
6993 ; CHECK-GI-NEXT: ldr w8, [sp, #784]
6994 ; CHECK-GI-NEXT: mov v0.b[9], w11
6995 ; CHECK-GI-NEXT: mov v2.b[8], w10
6996 ; CHECK-GI-NEXT: ldr w10, [sp, #1040]
6997 ; CHECK-GI-NEXT: ldr w11, [sp, #32]
6998 ; CHECK-GI-NEXT: mov v3.b[8], w8
6999 ; CHECK-GI-NEXT: ldr w8, [sp, #152]
7000 ; CHECK-GI-NEXT: mov v4.b[8], w9
7001 ; CHECK-GI-NEXT: ldr w9, [sp, #280]
7002 ; CHECK-GI-NEXT: mov v5.b[8], w10
7003 ; CHECK-GI-NEXT: ldr w10, [sp, #920]
7004 ; CHECK-GI-NEXT: mov v1.b[9], w8
7005 ; CHECK-GI-NEXT: ldr w8, [sp, #792]
7006 ; CHECK-GI-NEXT: mov v0.b[10], w11
7007 ; CHECK-GI-NEXT: mov v2.b[9], w9
7008 ; CHECK-GI-NEXT: ldr w9, [sp, #1048]
7009 ; CHECK-GI-NEXT: ldr w11, [sp, #40]
7010 ; CHECK-GI-NEXT: mov v3.b[9], w8
7011 ; CHECK-GI-NEXT: ldr w8, [sp, #160]
7012 ; CHECK-GI-NEXT: mov v4.b[9], w10
7013 ; CHECK-GI-NEXT: ldr w10, [sp, #288]
7014 ; CHECK-GI-NEXT: mov v5.b[9], w9
7015 ; CHECK-GI-NEXT: ldr w9, [sp, #928]
7016 ; CHECK-GI-NEXT: mov v1.b[10], w8
7017 ; CHECK-GI-NEXT: ldr w8, [sp, #800]
7018 ; CHECK-GI-NEXT: mov v0.b[11], w11
7019 ; CHECK-GI-NEXT: mov v2.b[10], w10
7020 ; CHECK-GI-NEXT: ldr w10, [sp, #1056]
7021 ; CHECK-GI-NEXT: ldr w11, [sp, #48]
7022 ; CHECK-GI-NEXT: mov v3.b[10], w8
7023 ; CHECK-GI-NEXT: ldr w8, [sp, #168]
7024 ; CHECK-GI-NEXT: mov v4.b[10], w9
7025 ; CHECK-GI-NEXT: ldr w9, [sp, #296]
7026 ; CHECK-GI-NEXT: mov v5.b[10], w10
7027 ; CHECK-GI-NEXT: ldr w10, [sp, #936]
7028 ; CHECK-GI-NEXT: mov v1.b[11], w8
7029 ; CHECK-GI-NEXT: ldr w8, [sp, #808]
7030 ; CHECK-GI-NEXT: mov v0.b[12], w11
7031 ; CHECK-GI-NEXT: mov v2.b[11], w9
7032 ; CHECK-GI-NEXT: ldr w9, [sp, #1064]
7033 ; CHECK-GI-NEXT: ldr w11, [sp, #56]
7034 ; CHECK-GI-NEXT: mov v3.b[11], w8
7035 ; CHECK-GI-NEXT: ldr w8, [sp, #176]
7036 ; CHECK-GI-NEXT: mov v4.b[11], w10
7037 ; CHECK-GI-NEXT: ldr w10, [sp, #304]
7038 ; CHECK-GI-NEXT: mov v5.b[11], w9
7039 ; CHECK-GI-NEXT: ldr w9, [sp, #944]
7040 ; CHECK-GI-NEXT: mov v1.b[12], w8
7041 ; CHECK-GI-NEXT: ldr w8, [sp, #816]
7042 ; CHECK-GI-NEXT: mov v0.b[13], w11
7043 ; CHECK-GI-NEXT: mov v2.b[12], w10
7044 ; CHECK-GI-NEXT: ldr w10, [sp, #1072]
7045 ; CHECK-GI-NEXT: ldr w11, [sp, #64]
7046 ; CHECK-GI-NEXT: mov v3.b[12], w8
7047 ; CHECK-GI-NEXT: ldr w8, [sp, #184]
7048 ; CHECK-GI-NEXT: mov v4.b[12], w9
7049 ; CHECK-GI-NEXT: ldr w9, [sp, #312]
7050 ; CHECK-GI-NEXT: mov v5.b[12], w10
7051 ; CHECK-GI-NEXT: ldr w10, [sp, #952]
7052 ; CHECK-GI-NEXT: mov v1.b[13], w8
7053 ; CHECK-GI-NEXT: ldr w8, [sp, #824]
7054 ; CHECK-GI-NEXT: mov v0.b[14], w11
7055 ; CHECK-GI-NEXT: mov v2.b[13], w9
7056 ; CHECK-GI-NEXT: ldr w9, [sp, #1080]
7057 ; CHECK-GI-NEXT: ldr w11, [sp, #72]
7058 ; CHECK-GI-NEXT: mov v3.b[13], w8
7059 ; CHECK-GI-NEXT: ldr w8, [sp, #192]
7060 ; CHECK-GI-NEXT: mov v4.b[13], w10
7061 ; CHECK-GI-NEXT: ldr w10, [sp, #320]
7062 ; CHECK-GI-NEXT: mov v5.b[13], w9
7063 ; CHECK-GI-NEXT: ldr w9, [sp, #960]
7064 ; CHECK-GI-NEXT: mov v1.b[14], w8
7065 ; CHECK-GI-NEXT: ldr w8, [sp, #832]
7066 ; CHECK-GI-NEXT: mov v0.b[15], w11
7067 ; CHECK-GI-NEXT: mov v2.b[14], w10
7068 ; CHECK-GI-NEXT: ldr w10, [sp, #1088]
7069 ; CHECK-GI-NEXT: ldr w11, [sp, #968]
7070 ; CHECK-GI-NEXT: mov v3.b[14], w8
7071 ; CHECK-GI-NEXT: mov v4.b[14], w9
7072 ; CHECK-GI-NEXT: ldr w8, [sp, #200]
7073 ; CHECK-GI-NEXT: mov v5.b[14], w10
7074 ; CHECK-GI-NEXT: ldr w9, [sp, #328]
7075 ; CHECK-GI-NEXT: ldr w10, [sp, #840]
7076 ; CHECK-GI-NEXT: mov v1.b[15], w8
7077 ; CHECK-GI-NEXT: ldr w8, [sp, #1096]
7078 ; CHECK-GI-NEXT: sdot v16.4s, v0.16b, v7.16b
7079 ; CHECK-GI-NEXT: mov v2.b[15], w9
7080 ; CHECK-GI-NEXT: mov v3.b[15], w10
7081 ; CHECK-GI-NEXT: mov v4.b[15], w11
7082 ; CHECK-GI-NEXT: mov v5.b[15], w8
7083 ; CHECK-GI-NEXT: sdot v17.4s, v1.16b, v7.16b
7084 ; CHECK-GI-NEXT: sdot v18.4s, v2.16b, v7.16b
7085 ; CHECK-GI-NEXT: sdot v19.4s, v3.16b, v7.16b
7086 ; CHECK-GI-NEXT: sdot v21.4s, v4.16b, v7.16b
7087 ; CHECK-GI-NEXT: sdot v20.4s, v5.16b, v7.16b
7088 ; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s
7089 ; CHECK-GI-NEXT: add v1.4s, v18.4s, v6.4s
7090 ; CHECK-GI-NEXT: add v2.4s, v19.4s, v21.4s
7091 ; CHECK-GI-NEXT: add v3.4s, v20.4s, v6.4s
7092 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
7093 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
7094 ; CHECK-GI-NEXT: addv s0, v0.4s
7095 ; CHECK-GI-NEXT: addv s1, v1.4s
7096 ; CHECK-GI-NEXT: fmov w8, s0
7097 ; CHECK-GI-NEXT: fmov w9, s1
7098 ; CHECK-GI-NEXT: add w0, w8, w9
7099 ; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
7100 ; CHECK-GI-NEXT: ret
7102 %az = sext <48 x i8> %a to <48 x i32>
7103 %r1 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %az)
7104 %cz = sext <48 x i8> %c to <48 x i32>
7105 %r2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %cz)
7106 %x = add i32 %r1, %r2
; test_udot_v64i8: loads <64 x i8> from %a and from %b, zero-extends both to
; <64 x i32>, multiplies them (nuw nsw), reduces the product with
; llvm.vector.reduce.add.v64i32 and adds %sum.
; Expected code in both check sets: four udot instructions, one per 16-byte
; quarter of the inputs, then addv and "add w0, w8, w2". The default llc run
; chains the udots into two accumulators; the -global-isel run uses four
; independent accumulators combined by a vector add tree.
7110 define i32 @test_udot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
7111 ; CHECK-SD-LABEL: test_udot_v64i8:
7112 ; CHECK-SD: // %bb.0: // %entry
7113 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
7114 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
7115 ; CHECK-SD-NEXT: ldp q2, q3, [x0, #32]
7116 ; CHECK-SD-NEXT: ldp q4, q5, [x1, #32]
7117 ; CHECK-SD-NEXT: udot v1.4s, v5.16b, v3.16b
7118 ; CHECK-SD-NEXT: udot v0.4s, v4.16b, v2.16b
7119 ; CHECK-SD-NEXT: ldp q2, q3, [x0]
7120 ; CHECK-SD-NEXT: ldp q4, q5, [x1]
7121 ; CHECK-SD-NEXT: udot v1.4s, v5.16b, v3.16b
7122 ; CHECK-SD-NEXT: udot v0.4s, v4.16b, v2.16b
7123 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
7124 ; CHECK-SD-NEXT: addv s0, v0.4s
7125 ; CHECK-SD-NEXT: fmov w8, s0
7126 ; CHECK-SD-NEXT: add w0, w8, w2
7127 ; CHECK-SD-NEXT: ret
7129 ; CHECK-GI-LABEL: test_udot_v64i8:
7130 ; CHECK-GI: // %bb.0: // %entry
7131 ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
7132 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
7133 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
7134 ; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
7135 ; CHECK-GI-NEXT: ldp q1, q2, [x0]
7136 ; CHECK-GI-NEXT: ldp q6, q7, [x0, #32]
7137 ; CHECK-GI-NEXT: ldp q16, q17, [x1]
7138 ; CHECK-GI-NEXT: ldp q18, q19, [x1, #32]
7139 ; CHECK-GI-NEXT: udot v0.4s, v16.16b, v1.16b
7140 ; CHECK-GI-NEXT: udot v4.4s, v17.16b, v2.16b
7141 ; CHECK-GI-NEXT: udot v5.4s, v18.16b, v6.16b
7142 ; CHECK-GI-NEXT: udot v3.4s, v19.16b, v7.16b
7143 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s
7144 ; CHECK-GI-NEXT: add v1.4s, v5.4s, v3.4s
7145 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
7146 ; CHECK-GI-NEXT: addv s0, v0.4s
7147 ; CHECK-GI-NEXT: fmov w8, s0
7148 ; CHECK-GI-NEXT: add w0, w8, w2
7149 ; CHECK-GI-NEXT: ret
7151 %0 = load <64 x i8>, ptr %a
7152 %1 = zext <64 x i8> %0 to <64 x i32>
7153 %2 = load <64 x i8>, ptr %b
7154 %3 = zext <64 x i8> %2 to <64 x i32>
7155 %4 = mul nuw nsw <64 x i32> %3, %1
7156 %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
7157 %op.extra = add i32 %5, %sum
; test_udot_v64i8_nomla: loads <64 x i8> from %a1, zero-extends it to
; <64 x i32> and reduces it with llvm.vector.reduce.add.v64i32; there is no
; multiply in the IR.
; Expected code in both check sets: four udot instructions whose second
; operand is an all-ones byte vector (movi v0.16b, #1), followed by vector adds
; and a single addv. The default llc run uses two chained accumulators; the
; -global-isel run uses four accumulators and a vector add tree.
7161 define i32 @test_udot_v64i8_nomla(ptr nocapture readonly %a1) {
7162 ; CHECK-SD-LABEL: test_udot_v64i8_nomla:
7163 ; CHECK-SD: // %bb.0: // %entry
7164 ; CHECK-SD-NEXT: movi v0.16b, #1
7165 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
7166 ; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
7167 ; CHECK-SD-NEXT: ldp q3, q4, [x0, #32]
7168 ; CHECK-SD-NEXT: udot v2.4s, v4.16b, v0.16b
7169 ; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b
7170 ; CHECK-SD-NEXT: ldp q3, q4, [x0]
7171 ; CHECK-SD-NEXT: udot v2.4s, v4.16b, v0.16b
7172 ; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b
7173 ; CHECK-SD-NEXT: add v0.4s, v1.4s, v2.4s
7174 ; CHECK-SD-NEXT: addv s0, v0.4s
7175 ; CHECK-SD-NEXT: fmov w0, s0
7176 ; CHECK-SD-NEXT: ret
7178 ; CHECK-GI-LABEL: test_udot_v64i8_nomla:
7179 ; CHECK-GI: // %bb.0: // %entry
7180 ; CHECK-GI-NEXT: movi v0.16b, #1
7181 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
7182 ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
7183 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
7184 ; CHECK-GI-NEXT: ldp q5, q6, [x0]
7185 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
7186 ; CHECK-GI-NEXT: ldp q7, q16, [x0, #32]
7187 ; CHECK-GI-NEXT: udot v1.4s, v5.16b, v0.16b
7188 ; CHECK-GI-NEXT: udot v3.4s, v6.16b, v0.16b
7189 ; CHECK-GI-NEXT: udot v2.4s, v16.16b, v0.16b
7190 ; CHECK-GI-NEXT: udot v4.4s, v7.16b, v0.16b
7191 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v3.4s
7192 ; CHECK-GI-NEXT: add v1.4s, v4.4s, v2.4s
7193 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
7194 ; CHECK-GI-NEXT: addv s0, v0.4s
7195 ; CHECK-GI-NEXT: fmov w0, s0
7196 ; CHECK-GI-NEXT: ret
7198 %0 = load <64 x i8>, ptr %a1
7199 %1 = zext <64 x i8> %0 to <64 x i32>
7200 %2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %1)
; test_sdot_v64i8: signed counterpart of the 64-byte dot-product test. Loads
; <64 x i8> from %a and from %b, sign-extends both to <64 x i32>, multiplies
; them (nsw), reduces the product with llvm.vector.reduce.add.v64i32 and adds
; %sum (nsw).
; Expected code in both check sets: four sdot instructions, one per 16-byte
; quarter of the inputs, then addv and "add w0, w8, w2". The default llc run
; chains the sdots into two accumulators; the -global-isel run uses four
; independent accumulators combined by a vector add tree.
7203 define i32 @test_sdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
7204 ; CHECK-SD-LABEL: test_sdot_v64i8:
7205 ; CHECK-SD: // %bb.0: // %entry
7206 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
7207 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
7208 ; CHECK-SD-NEXT: ldp q2, q3, [x0, #32]
7209 ; CHECK-SD-NEXT: ldp q4, q5, [x1, #32]
7210 ; CHECK-SD-NEXT: sdot v1.4s, v5.16b, v3.16b
7211 ; CHECK-SD-NEXT: sdot v0.4s, v4.16b, v2.16b
7212 ; CHECK-SD-NEXT: ldp q2, q3, [x0]
7213 ; CHECK-SD-NEXT: ldp q4, q5, [x1]
7214 ; CHECK-SD-NEXT: sdot v1.4s, v5.16b, v3.16b
7215 ; CHECK-SD-NEXT: sdot v0.4s, v4.16b, v2.16b
7216 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
7217 ; CHECK-SD-NEXT: addv s0, v0.4s
7218 ; CHECK-SD-NEXT: fmov w8, s0
7219 ; CHECK-SD-NEXT: add w0, w8, w2
7220 ; CHECK-SD-NEXT: ret
7222 ; CHECK-GI-LABEL: test_sdot_v64i8:
7223 ; CHECK-GI: // %bb.0: // %entry
7224 ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
7225 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
7226 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
7227 ; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
7228 ; CHECK-GI-NEXT: ldp q1, q2, [x0]
7229 ; CHECK-GI-NEXT: ldp q6, q7, [x0, #32]
7230 ; CHECK-GI-NEXT: ldp q16, q17, [x1]
7231 ; CHECK-GI-NEXT: ldp q18, q19, [x1, #32]
7232 ; CHECK-GI-NEXT: sdot v0.4s, v16.16b, v1.16b
7233 ; CHECK-GI-NEXT: sdot v4.4s, v17.16b, v2.16b
7234 ; CHECK-GI-NEXT: sdot v5.4s, v18.16b, v6.16b
7235 ; CHECK-GI-NEXT: sdot v3.4s, v19.16b, v7.16b
7236 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s
7237 ; CHECK-GI-NEXT: add v1.4s, v5.4s, v3.4s
7238 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
7239 ; CHECK-GI-NEXT: addv s0, v0.4s
7240 ; CHECK-GI-NEXT: fmov w8, s0
7241 ; CHECK-GI-NEXT: add w0, w8, w2
7242 ; CHECK-GI-NEXT: ret
7244 %0 = load <64 x i8>, ptr %a
7245 %1 = sext <64 x i8> %0 to <64 x i32>
7246 %2 = load <64 x i8>, ptr %b
7247 %3 = sext <64 x i8> %2 to <64 x i32>
7248 %4 = mul nsw <64 x i32> %3, %1
7249 %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
7250 %op.extra = add nsw i32 %5, %sum
; Two independent signed dot products on by-value <64 x i8> arguments:
; reduce.add(sext(%a) * sext(%b)) + reduce.add(sext(%c) * sext(%d)).
; The expected code consumes v0-v7 directly (presumably %a and %b) and loads
; the remaining operands from the stack.
; Default-selector (SD) checks: 8 sdot into 4 accumulators, merged by vector
; adds and finished with a single addv.
; -global-isel (GI) checks: 8 sdot into 8 accumulators; each product is reduced
; with its own addv and the two scalars are added at the end.
7254 define i32 @test_sdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
7255 ; CHECK-SD-LABEL: test_sdot_v64i8_double:
7256 ; CHECK-SD: // %bb.0: // %entry
7257 ; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
7258 ; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
7259 ; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
7260 ; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
7261 ; CHECK-SD-NEXT: ldp q20, q21, [sp, #96]
7262 ; CHECK-SD-NEXT: ldp q22, q23, [sp, #32]
7263 ; CHECK-SD-NEXT: sdot v16.4s, v3.16b, v7.16b
7264 ; CHECK-SD-NEXT: sdot v18.4s, v2.16b, v6.16b
7265 ; CHECK-SD-NEXT: sdot v19.4s, v23.16b, v21.16b
7266 ; CHECK-SD-NEXT: sdot v17.4s, v22.16b, v20.16b
7267 ; CHECK-SD-NEXT: ldp q2, q3, [sp, #64]
7268 ; CHECK-SD-NEXT: ldp q6, q7, [sp]
7269 ; CHECK-SD-NEXT: sdot v16.4s, v1.16b, v5.16b
7270 ; CHECK-SD-NEXT: sdot v18.4s, v0.16b, v4.16b
7271 ; CHECK-SD-NEXT: sdot v19.4s, v7.16b, v3.16b
7272 ; CHECK-SD-NEXT: sdot v17.4s, v6.16b, v2.16b
7273 ; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
7274 ; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
7275 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
7276 ; CHECK-SD-NEXT: addv s0, v0.4s
7277 ; CHECK-SD-NEXT: fmov w0, s0
7278 ; CHECK-SD-NEXT: ret
7280 ; CHECK-GI-LABEL: test_sdot_v64i8_double:
7281 ; CHECK-GI: // %bb.0: // %entry
7282 ; CHECK-GI-NEXT: movi v18.2d, #0000000000000000
7283 ; CHECK-GI-NEXT: movi v21.2d, #0000000000000000
7284 ; CHECK-GI-NEXT: movi v22.2d, #0000000000000000
7285 ; CHECK-GI-NEXT: movi v23.2d, #0000000000000000
7286 ; CHECK-GI-NEXT: ldp q16, q17, [sp]
7287 ; CHECK-GI-NEXT: movi v24.2d, #0000000000000000
7288 ; CHECK-GI-NEXT: movi v25.2d, #0000000000000000
7289 ; CHECK-GI-NEXT: movi v26.2d, #0000000000000000
7290 ; CHECK-GI-NEXT: movi v27.2d, #0000000000000000
7291 ; CHECK-GI-NEXT: ldp q19, q20, [sp, #32]
7292 ; CHECK-GI-NEXT: sdot v18.4s, v0.16b, v4.16b
7293 ; CHECK-GI-NEXT: ldp q0, q4, [sp, #64]
7294 ; CHECK-GI-NEXT: sdot v21.4s, v1.16b, v5.16b
7295 ; CHECK-GI-NEXT: ldp q1, q5, [sp, #96]
7296 ; CHECK-GI-NEXT: sdot v22.4s, v2.16b, v6.16b
7297 ; CHECK-GI-NEXT: sdot v23.4s, v3.16b, v7.16b
7298 ; CHECK-GI-NEXT: sdot v24.4s, v16.16b, v0.16b
7299 ; CHECK-GI-NEXT: sdot v26.4s, v17.16b, v4.16b
7300 ; CHECK-GI-NEXT: sdot v27.4s, v19.16b, v1.16b
7301 ; CHECK-GI-NEXT: sdot v25.4s, v20.16b, v5.16b
7302 ; CHECK-GI-NEXT: add v0.4s, v18.4s, v21.4s
7303 ; CHECK-GI-NEXT: add v1.4s, v22.4s, v23.4s
7304 ; CHECK-GI-NEXT: add v2.4s, v24.4s, v26.4s
7305 ; CHECK-GI-NEXT: add v3.4s, v27.4s, v25.4s
7306 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
7307 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
7308 ; CHECK-GI-NEXT: addv s0, v0.4s
7309 ; CHECK-GI-NEXT: addv s1, v1.4s
7310 ; CHECK-GI-NEXT: fmov w8, s0
7311 ; CHECK-GI-NEXT: fmov w9, s1
7312 ; CHECK-GI-NEXT: add w0, w8, w9
7313 ; CHECK-GI-NEXT: ret
; IR under test. Despite the "z" suffix, %az/%bz/%cz/%dz are all sign
; extensions (sext) in this function.
7315 %az = sext <64 x i8> %a to <64 x i32>
7316 %bz = sext <64 x i8> %b to <64 x i32>
7317 %m1 = mul nuw nsw <64 x i32> %az, %bz
7318 %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1)
7319 %cz = sext <64 x i8> %c to <64 x i32>
7320 %dz = sext <64 x i8> %d to <64 x i32>
7321 %m2 = mul nuw nsw <64 x i32> %cz, %dz
7322 %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2)
7323 %x = add i32 %r1, %r2
; "No multiply" variant of the double signed reduction: only %a and %c are
; used (%b and %d are never referenced in the body). Each is sign-extended to
; i32, summed with llvm.vector.reduce.add, and the two sums are added.
; The expected code still uses SDOT, with an all-ones byte vector
; (movi v4.16b, #1) as the second multiplicand.
; Default-selector (SD) checks: 8 sdot into 4 accumulators, one addv.
; -global-isel (GI) checks: 8 sdot into 8 accumulators, two addv and a final
; scalar add.
7327 define i32 @test_sdot_v64i8_double_nomla(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
7328 ; CHECK-SD-LABEL: test_sdot_v64i8_double_nomla:
7329 ; CHECK-SD: // %bb.0: // %entry
7330 ; CHECK-SD-NEXT: movi v4.16b, #1
7331 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
7332 ; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
7333 ; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
7334 ; CHECK-SD-NEXT: ldp q17, q18, [sp, #32]
7335 ; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
7336 ; CHECK-SD-NEXT: sdot v5.4s, v3.16b, v4.16b
7337 ; CHECK-SD-NEXT: sdot v6.4s, v17.16b, v4.16b
7338 ; CHECK-SD-NEXT: sdot v7.4s, v2.16b, v4.16b
7339 ; CHECK-SD-NEXT: ldp q2, q3, [sp]
7340 ; CHECK-SD-NEXT: sdot v16.4s, v18.16b, v4.16b
7341 ; CHECK-SD-NEXT: sdot v5.4s, v1.16b, v4.16b
7342 ; CHECK-SD-NEXT: sdot v6.4s, v2.16b, v4.16b
7343 ; CHECK-SD-NEXT: sdot v7.4s, v0.16b, v4.16b
7344 ; CHECK-SD-NEXT: sdot v16.4s, v3.16b, v4.16b
7345 ; CHECK-SD-NEXT: add v0.4s, v7.4s, v5.4s
7346 ; CHECK-SD-NEXT: add v1.4s, v6.4s, v16.4s
7347 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
7348 ; CHECK-SD-NEXT: addv s0, v0.4s
7349 ; CHECK-SD-NEXT: fmov w0, s0
7350 ; CHECK-SD-NEXT: ret
7352 ; CHECK-GI-LABEL: test_sdot_v64i8_double_nomla:
7353 ; CHECK-GI: // %bb.0: // %entry
7354 ; CHECK-GI-NEXT: movi v4.16b, #1
7355 ; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
7356 ; CHECK-GI-NEXT: movi v6.2d, #0000000000000000
7357 ; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
7358 ; CHECK-GI-NEXT: ldp q21, q22, [sp]
7359 ; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
7360 ; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
7361 ; CHECK-GI-NEXT: movi v18.2d, #0000000000000000
7362 ; CHECK-GI-NEXT: movi v19.2d, #0000000000000000
7363 ; CHECK-GI-NEXT: movi v20.2d, #0000000000000000
7364 ; CHECK-GI-NEXT: sdot v5.4s, v0.16b, v4.16b
7365 ; CHECK-GI-NEXT: sdot v6.4s, v1.16b, v4.16b
7366 ; CHECK-GI-NEXT: ldp q0, q1, [sp, #32]
7367 ; CHECK-GI-NEXT: sdot v7.4s, v2.16b, v4.16b
7368 ; CHECK-GI-NEXT: sdot v16.4s, v3.16b, v4.16b
7369 ; CHECK-GI-NEXT: sdot v17.4s, v21.16b, v4.16b
7370 ; CHECK-GI-NEXT: sdot v19.4s, v22.16b, v4.16b
7371 ; CHECK-GI-NEXT: sdot v20.4s, v0.16b, v4.16b
7372 ; CHECK-GI-NEXT: sdot v18.4s, v1.16b, v4.16b
7373 ; CHECK-GI-NEXT: add v0.4s, v5.4s, v6.4s
7374 ; CHECK-GI-NEXT: add v1.4s, v7.4s, v16.4s
7375 ; CHECK-GI-NEXT: add v2.4s, v17.4s, v19.4s
7376 ; CHECK-GI-NEXT: add v3.4s, v20.4s, v18.4s
7377 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
7378 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
7379 ; CHECK-GI-NEXT: addv s0, v0.4s
7380 ; CHECK-GI-NEXT: addv s1, v1.4s
7381 ; CHECK-GI-NEXT: fmov w8, s0
7382 ; CHECK-GI-NEXT: fmov w9, s1
7383 ; CHECK-GI-NEXT: add w0, w8, w9
7384 ; CHECK-GI-NEXT: ret
; IR under test: plain sign-extend + reduce, no mul. %az and %cz are sext
; results despite the "z" suffix.
7386 %az = sext <64 x i8> %a to <64 x i32>
7387 %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %az)
7388 %cz = sext <64 x i8> %c to <64 x i32>
7389 %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %cz)
7390 %x = add i32 %r1, %r2
; Mixed-sign 8-bit dot product over 64 elements loaded from memory:
; zext(%a) * sext(%b), summed with llvm.vector.reduce.add, then %sum is added.
; Default-selector (SD) checks: four USDOT, one per 16-byte piece, each with
; its own zeroed accumulator, followed by a vector add tree and addv.
; -global-isel (GI) checks: no USDOT appears. The extends are expanded with
; ushll/sshll (8 -> 16 -> 32 bits), the products are formed with mul/mla on
; .4s vectors, and d8-d15 are saved and restored around the sequence.
7394 define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
7395 ; CHECK-SD-LABEL: test_usdot_v64i8:
7396 ; CHECK-SD: // %bb.0: // %entry
7397 ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
7398 ; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
7399 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
7400 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
7401 ; CHECK-SD-NEXT: ldp q1, q2, [x0, #32]
7402 ; CHECK-SD-NEXT: ldp q6, q7, [x1, #32]
7403 ; CHECK-SD-NEXT: ldp q16, q17, [x0]
7404 ; CHECK-SD-NEXT: ldp q18, q19, [x1]
7405 ; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v7.16b
7406 ; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v6.16b
7407 ; CHECK-SD-NEXT: usdot v4.4s, v17.16b, v19.16b
7408 ; CHECK-SD-NEXT: usdot v3.4s, v16.16b, v18.16b
7409 ; CHECK-SD-NEXT: add v0.4s, v4.4s, v0.4s
7410 ; CHECK-SD-NEXT: add v1.4s, v3.4s, v5.4s
7411 ; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
7412 ; CHECK-SD-NEXT: addv s0, v0.4s
7413 ; CHECK-SD-NEXT: fmov w8, s0
7414 ; CHECK-SD-NEXT: add w0, w8, w2
7415 ; CHECK-SD-NEXT: ret
7417 ; CHECK-GI-LABEL: test_usdot_v64i8:
7418 ; CHECK-GI: // %bb.0: // %entry
7419 ; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
7420 ; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
7421 ; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
7422 ; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
7423 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
7424 ; CHECK-GI-NEXT: .cfi_offset b8, -8
7425 ; CHECK-GI-NEXT: .cfi_offset b9, -16
7426 ; CHECK-GI-NEXT: .cfi_offset b10, -24
7427 ; CHECK-GI-NEXT: .cfi_offset b11, -32
7428 ; CHECK-GI-NEXT: .cfi_offset b12, -40
7429 ; CHECK-GI-NEXT: .cfi_offset b13, -48
7430 ; CHECK-GI-NEXT: .cfi_offset b14, -56
7431 ; CHECK-GI-NEXT: .cfi_offset b15, -64
7432 ; CHECK-GI-NEXT: ldp q0, q1, [x1]
7433 ; CHECK-GI-NEXT: ldp q21, q17, [x0]
7434 ; CHECK-GI-NEXT: ldp q3, q19, [x1, #32]
7435 ; CHECK-GI-NEXT: ldp q18, q4, [x0, #32]
7436 ; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0
7437 ; CHECK-GI-NEXT: sshll2 v5.8h, v0.16b, #0
7438 ; CHECK-GI-NEXT: sshll v7.8h, v1.8b, #0
7439 ; CHECK-GI-NEXT: sshll2 v22.8h, v1.16b, #0
7440 ; CHECK-GI-NEXT: sshll v23.8h, v3.8b, #0
7441 ; CHECK-GI-NEXT: sshll2 v24.8h, v3.16b, #0
7442 ; CHECK-GI-NEXT: sshll v25.8h, v19.8b, #0
7443 ; CHECK-GI-NEXT: sshll2 v26.8h, v19.16b, #0
7444 ; CHECK-GI-NEXT: ushll v27.8h, v21.8b, #0
7445 ; CHECK-GI-NEXT: ushll2 v28.8h, v21.16b, #0
7446 ; CHECK-GI-NEXT: ushll v30.8h, v17.8b, #0
7447 ; CHECK-GI-NEXT: ushll2 v17.8h, v17.16b, #0
7448 ; CHECK-GI-NEXT: ushll v8.8h, v18.8b, #0
7449 ; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0
7450 ; CHECK-GI-NEXT: ushll v9.8h, v4.8b, #0
7451 ; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0
7452 ; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0
7453 ; CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0
7454 ; CHECK-GI-NEXT: sshll v1.4s, v5.4h, #0
7455 ; CHECK-GI-NEXT: sshll2 v16.4s, v5.8h, #0
7456 ; CHECK-GI-NEXT: sshll v2.4s, v7.4h, #0
7457 ; CHECK-GI-NEXT: sshll2 v20.4s, v7.8h, #0
7458 ; CHECK-GI-NEXT: sshll v3.4s, v22.4h, #0
7459 ; CHECK-GI-NEXT: sshll2 v22.4s, v22.8h, #0
7460 ; CHECK-GI-NEXT: sshll v5.4s, v23.4h, #0
7461 ; CHECK-GI-NEXT: sshll2 v23.4s, v23.8h, #0
7462 ; CHECK-GI-NEXT: sshll v7.4s, v24.4h, #0
7463 ; CHECK-GI-NEXT: sshll2 v24.4s, v24.8h, #0
7464 ; CHECK-GI-NEXT: sshll v19.4s, v25.4h, #0
7465 ; CHECK-GI-NEXT: sshll2 v25.4s, v25.8h, #0
7466 ; CHECK-GI-NEXT: sshll v21.4s, v26.4h, #0
7467 ; CHECK-GI-NEXT: sshll2 v26.4s, v26.8h, #0
7468 ; CHECK-GI-NEXT: ushll v29.4s, v27.4h, #0
7469 ; CHECK-GI-NEXT: ushll2 v27.4s, v27.8h, #0
7470 ; CHECK-GI-NEXT: ushll v31.4s, v28.4h, #0
7471 ; CHECK-GI-NEXT: ushll2 v28.4s, v28.8h, #0
7472 ; CHECK-GI-NEXT: ushll v10.4s, v30.4h, #0
7473 ; CHECK-GI-NEXT: ushll2 v30.4s, v30.8h, #0
7474 ; CHECK-GI-NEXT: ushll v11.4s, v17.4h, #0
7475 ; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0
7476 ; CHECK-GI-NEXT: ushll2 v12.4s, v8.8h, #0
7477 ; CHECK-GI-NEXT: ushll2 v13.4s, v18.8h, #0
7478 ; CHECK-GI-NEXT: ushll2 v14.4s, v9.8h, #0
7479 ; CHECK-GI-NEXT: ushll2 v15.4s, v4.8h, #0
7480 ; CHECK-GI-NEXT: mul v6.4s, v6.4s, v27.4s
7481 ; CHECK-GI-NEXT: mul v16.4s, v16.4s, v28.4s
7482 ; CHECK-GI-NEXT: mul v20.4s, v20.4s, v30.4s
7483 ; CHECK-GI-NEXT: mul v17.4s, v22.4s, v17.4s
7484 ; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0
7485 ; CHECK-GI-NEXT: mul v22.4s, v23.4s, v12.4s
7486 ; CHECK-GI-NEXT: mul v23.4s, v24.4s, v13.4s
7487 ; CHECK-GI-NEXT: mul v24.4s, v25.4s, v14.4s
7488 ; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
7489 ; CHECK-GI-NEXT: mul v25.4s, v26.4s, v15.4s
7490 ; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0
7491 ; CHECK-GI-NEXT: ushll v26.4s, v9.4h, #0
7492 ; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0
7493 ; CHECK-GI-NEXT: mla v6.4s, v0.4s, v29.4s
7494 ; CHECK-GI-NEXT: mla v16.4s, v1.4s, v31.4s
7495 ; CHECK-GI-NEXT: mla v20.4s, v2.4s, v10.4s
7496 ; CHECK-GI-NEXT: mla v17.4s, v3.4s, v11.4s
7497 ; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
7498 ; CHECK-GI-NEXT: mla v22.4s, v5.4s, v8.4s
7499 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
7500 ; CHECK-GI-NEXT: mla v23.4s, v7.4s, v18.4s
7501 ; CHECK-GI-NEXT: mla v24.4s, v19.4s, v26.4s
7502 ; CHECK-GI-NEXT: mla v25.4s, v21.4s, v4.4s
7503 ; CHECK-GI-NEXT: add v0.4s, v6.4s, v16.4s
7504 ; CHECK-GI-NEXT: add v1.4s, v20.4s, v17.4s
7505 ; CHECK-GI-NEXT: add v2.4s, v22.4s, v23.4s
7506 ; CHECK-GI-NEXT: add v3.4s, v24.4s, v25.4s
7507 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
7508 ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
7509 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
7510 ; CHECK-GI-NEXT: addv s0, v0.4s
7511 ; CHECK-GI-NEXT: fmov w8, s0
7512 ; CHECK-GI-NEXT: add w0, w8, w2
7513 ; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
7514 ; CHECK-GI-NEXT: ret
; IR under test: %a is the zero-extended (unsigned) operand and %b the
; sign-extended one; the multiply is written as %3 * %1, i.e. sext(%b) * zext(%a).
7516 %0 = load <64 x i8>, ptr %a
7517 %1 = zext <64 x i8> %0 to <64 x i32>
7518 %2 = load <64 x i8>, ptr %b
7519 %3 = sext <64 x i8> %2 to <64 x i32>
7520 %4 = mul nsw <64 x i32> %3, %1
7521 %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
7522 %op.extra = add nsw i32 %5, %sum
; Two mixed-sign dot products on by-value <64 x i8> arguments:
; reduce.add(zext(%a) * sext(%b)) + reduce.add(zext(%c) * sext(%d)).
; Default-selector (SD) checks: 8 USDOT into 8 zeroed accumulators, combined
; by a vector add tree and a single addv.
; -global-isel (GI) checks: no USDOT appears. The widening multiply is fully
; expanded into ushll/sshll + mul/mla, using a 304-byte stack frame with
; vector spills and reloads; the two products are reduced by separate addv
; instructions and added as scalars.
7526 define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
7527 ; CHECK-SD-LABEL: test_usdot_v64i8_double:
7528 ; CHECK-SD: // %bb.0: // %entry
7529 ; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
7530 ; CHECK-SD-NEXT: movi v21.2d, #0000000000000000
7531 ; CHECK-SD-NEXT: movi v22.2d, #0000000000000000
7532 ; CHECK-SD-NEXT: movi v23.2d, #0000000000000000
7533 ; CHECK-SD-NEXT: ldp q16, q17, [sp, #64]
7534 ; CHECK-SD-NEXT: movi v24.2d, #0000000000000000
7535 ; CHECK-SD-NEXT: movi v25.2d, #0000000000000000
7536 ; CHECK-SD-NEXT: movi v26.2d, #0000000000000000
7537 ; CHECK-SD-NEXT: movi v27.2d, #0000000000000000
7538 ; CHECK-SD-NEXT: ldp q19, q20, [sp, #96]
7539 ; CHECK-SD-NEXT: usdot v18.4s, v3.16b, v7.16b
7540 ; CHECK-SD-NEXT: ldp q3, q7, [sp, #32]
7541 ; CHECK-SD-NEXT: usdot v21.4s, v1.16b, v5.16b
7542 ; CHECK-SD-NEXT: ldp q1, q5, [sp]
7543 ; CHECK-SD-NEXT: usdot v22.4s, v2.16b, v6.16b
7544 ; CHECK-SD-NEXT: usdot v23.4s, v0.16b, v4.16b
7545 ; CHECK-SD-NEXT: usdot v24.4s, v7.16b, v20.16b
7546 ; CHECK-SD-NEXT: usdot v27.4s, v3.16b, v19.16b
7547 ; CHECK-SD-NEXT: usdot v26.4s, v5.16b, v17.16b
7548 ; CHECK-SD-NEXT: usdot v25.4s, v1.16b, v16.16b
7549 ; CHECK-SD-NEXT: add v0.4s, v21.4s, v18.4s
7550 ; CHECK-SD-NEXT: add v1.4s, v23.4s, v22.4s
7551 ; CHECK-SD-NEXT: add v2.4s, v26.4s, v24.4s
7552 ; CHECK-SD-NEXT: add v3.4s, v25.4s, v27.4s
7553 ; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
7554 ; CHECK-SD-NEXT: add v1.4s, v3.4s, v2.4s
7555 ; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
7556 ; CHECK-SD-NEXT: addv s0, v0.4s
7557 ; CHECK-SD-NEXT: fmov w0, s0
7558 ; CHECK-SD-NEXT: ret
7560 ; CHECK-GI-LABEL: test_usdot_v64i8_double:
7561 ; CHECK-GI: // %bb.0: // %entry
7562 ; CHECK-GI-NEXT: sub sp, sp, #304
7563 ; CHECK-GI-NEXT: stp d15, d14, [sp, #224] // 16-byte Folded Spill
7564 ; CHECK-GI-NEXT: stp d13, d12, [sp, #240] // 16-byte Folded Spill
7565 ; CHECK-GI-NEXT: stp d11, d10, [sp, #256] // 16-byte Folded Spill
7566 ; CHECK-GI-NEXT: stp d9, d8, [sp, #272] // 16-byte Folded Spill
7567 ; CHECK-GI-NEXT: str x29, [sp, #288] // 8-byte Folded Spill
7568 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 304
7569 ; CHECK-GI-NEXT: .cfi_offset w29, -16
7570 ; CHECK-GI-NEXT: .cfi_offset b8, -24
7571 ; CHECK-GI-NEXT: .cfi_offset b9, -32
7572 ; CHECK-GI-NEXT: .cfi_offset b10, -40
7573 ; CHECK-GI-NEXT: .cfi_offset b11, -48
7574 ; CHECK-GI-NEXT: .cfi_offset b12, -56
7575 ; CHECK-GI-NEXT: .cfi_offset b13, -64
7576 ; CHECK-GI-NEXT: .cfi_offset b14, -72
7577 ; CHECK-GI-NEXT: .cfi_offset b15, -80
7578 ; CHECK-GI-NEXT: ushll v17.8h, v0.8b, #0
7579 ; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
7580 ; CHECK-GI-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload
7581 ; CHECK-GI-NEXT: mov v20.16b, v3.16b
7582 ; CHECK-GI-NEXT: ushll v16.8h, v1.8b, #0
7583 ; CHECK-GI-NEXT: ushll2 v18.8h, v1.16b, #0
7584 ; CHECK-GI-NEXT: ushll v26.8h, v2.8b, #0
7585 ; CHECK-GI-NEXT: ldp q27, q28, [sp, #304]
7586 ; CHECK-GI-NEXT: ushll2 v29.8h, v2.16b, #0
7587 ; CHECK-GI-NEXT: ushll v2.4s, v17.4h, #0
7588 ; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
7589 ; CHECK-GI-NEXT: sshll v8.8h, v4.8b, #0
7590 ; CHECK-GI-NEXT: ldp q23, q21, [sp, #368]
7591 ; CHECK-GI-NEXT: sshll2 v9.8h, v4.16b, #0
7592 ; CHECK-GI-NEXT: sshll2 v11.8h, v5.16b, #0
7593 ; CHECK-GI-NEXT: mov v25.16b, v7.16b
7594 ; CHECK-GI-NEXT: ushll2 v19.4s, v17.8h, #0
7595 ; CHECK-GI-NEXT: stp q1, q2, [sp, #192] // 32-byte Folded Spill
7596 ; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0
7597 ; CHECK-GI-NEXT: ushll2 v17.4s, v18.8h, #0
7598 ; CHECK-GI-NEXT: ldp q24, q22, [sp, #336]
7599 ; CHECK-GI-NEXT: sshll v10.8h, v5.8b, #0
7600 ; CHECK-GI-NEXT: sshll v12.8h, v6.8b, #0
7601 ; CHECK-GI-NEXT: sshll2 v13.8h, v6.16b, #0
7602 ; CHECK-GI-NEXT: mov v2.16b, v20.16b
7603 ; CHECK-GI-NEXT: sshll2 v0.4s, v8.8h, #0
7604 ; CHECK-GI-NEXT: sshll2 v4.4s, v9.8h, #0
7605 ; CHECK-GI-NEXT: sshll2 v6.4s, v11.8h, #0
7606 ; CHECK-GI-NEXT: ushll2 v7.4s, v16.8h, #0
7607 ; CHECK-GI-NEXT: ushll2 v31.4s, v29.8h, #0
7608 ; CHECK-GI-NEXT: sshll2 v5.4s, v10.8h, #0
7609 ; CHECK-GI-NEXT: sshll2 v1.4s, v13.8h, #0
7610 ; CHECK-GI-NEXT: ushll2 v30.4s, v26.8h, #0
7611 ; CHECK-GI-NEXT: ushll v14.8h, v2.8b, #0
7612 ; CHECK-GI-NEXT: mul v20.4s, v19.4s, v0.4s
7613 ; CHECK-GI-NEXT: mul v19.4s, v3.4s, v4.4s
7614 ; CHECK-GI-NEXT: sshll v0.8h, v25.8b, #0
7615 ; CHECK-GI-NEXT: mul v4.4s, v17.4s, v6.4s
7616 ; CHECK-GI-NEXT: sshll2 v15.4s, v12.8h, #0
7617 ; CHECK-GI-NEXT: ldp q17, q3, [sp, #400]
7618 ; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s
7619 ; CHECK-GI-NEXT: mul v7.4s, v31.4s, v1.4s
7620 ; CHECK-GI-NEXT: ushll2 v31.8h, v2.16b, #0
7621 ; CHECK-GI-NEXT: sshll2 v25.8h, v25.16b, #0
7622 ; CHECK-GI-NEXT: sshll2 v1.4s, v0.8h, #0
7623 ; CHECK-GI-NEXT: ushll v2.4s, v14.4h, #0
7624 ; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
7625 ; CHECK-GI-NEXT: str q3, [sp, #96] // 16-byte Folded Spill
7626 ; CHECK-GI-NEXT: ushll2 v3.4s, v14.8h, #0
7627 ; CHECK-GI-NEXT: mul v6.4s, v30.4s, v15.4s
7628 ; CHECK-GI-NEXT: str q31, [sp, #160] // 16-byte Folded Spill
7629 ; CHECK-GI-NEXT: ushll v30.4s, v26.4h, #0
7630 ; CHECK-GI-NEXT: sshll v26.4s, v8.4h, #0
7631 ; CHECK-GI-NEXT: ushll v14.8h, v27.8b, #0
7632 ; CHECK-GI-NEXT: ushll v15.4s, v29.4h, #0
7633 ; CHECK-GI-NEXT: sshll v29.4s, v9.4h, #0
7634 ; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s
7635 ; CHECK-GI-NEXT: ushll2 v3.4s, v31.8h, #0
7636 ; CHECK-GI-NEXT: ushll v31.8h, v28.8b, #0
7637 ; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0
7638 ; CHECK-GI-NEXT: sshll v8.4s, v10.4h, #0
7639 ; CHECK-GI-NEXT: sshll v9.4s, v11.4h, #0
7640 ; CHECK-GI-NEXT: sshll v10.4s, v12.4h, #0
7641 ; CHECK-GI-NEXT: sshll v11.4s, v13.4h, #0
7642 ; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0
7643 ; CHECK-GI-NEXT: stp q3, q25, [sp, #112] // 32-byte Folded Spill
7644 ; CHECK-GI-NEXT: ldr q3, [sp, #208] // 16-byte Folded Reload
7645 ; CHECK-GI-NEXT: ushll2 v28.8h, v28.16b, #0
7646 ; CHECK-GI-NEXT: mla v1.4s, v2.4s, v0.4s
7647 ; CHECK-GI-NEXT: ushll2 v0.4s, v31.8h, #0
7648 ; CHECK-GI-NEXT: mla v5.4s, v16.4s, v8.4s
7649 ; CHECK-GI-NEXT: mla v20.4s, v3.4s, v26.4s
7650 ; CHECK-GI-NEXT: sshll2 v3.4s, v25.8h, #0
7651 ; CHECK-GI-NEXT: mla v6.4s, v30.4s, v10.4s
7652 ; CHECK-GI-NEXT: mla v7.4s, v15.4s, v11.4s
7653 ; CHECK-GI-NEXT: sshll v25.8h, v23.8b, #0
7654 ; CHECK-GI-NEXT: mla v4.4s, v18.4s, v9.4s
7655 ; CHECK-GI-NEXT: ushll v30.8h, v22.8b, #0
7656 ; CHECK-GI-NEXT: ushll2 v26.8h, v22.16b, #0
7657 ; CHECK-GI-NEXT: sshll v22.8h, v21.8b, #0
7658 ; CHECK-GI-NEXT: str q3, [sp, #32] // 16-byte Folded Spill
7659 ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
7660 ; CHECK-GI-NEXT: ushll2 v8.8h, v27.16b, #0
7661 ; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill
7662 ; CHECK-GI-NEXT: ldr q9, [sp, #32] // 16-byte Folded Reload
7663 ; CHECK-GI-NEXT: ushll2 v1.4s, v14.8h, #0
7664 ; CHECK-GI-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill
7665 ; CHECK-GI-NEXT: mla v19.4s, v3.4s, v29.4s
7666 ; CHECK-GI-NEXT: sshll2 v7.4s, v25.8h, #0
7667 ; CHECK-GI-NEXT: str q5, [sp, #176] // 16-byte Folded Spill
7668 ; CHECK-GI-NEXT: ushll v29.8h, v24.8b, #0
7669 ; CHECK-GI-NEXT: ushll2 v27.8h, v24.16b, #0
7670 ; CHECK-GI-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
7671 ; CHECK-GI-NEXT: ldp q0, q16, [sp, #96] // 32-byte Folded Reload
7672 ; CHECK-GI-NEXT: str q4, [sp, #144] // 16-byte Folded Spill
7673 ; CHECK-GI-NEXT: sshll2 v24.8h, v23.16b, #0
7674 ; CHECK-GI-NEXT: ushll2 v18.4s, v26.8h, #0
7675 ; CHECK-GI-NEXT: stp q19, q20, [sp, #192] // 32-byte Folded Spill
7676 ; CHECK-GI-NEXT: sshll2 v20.8h, v21.16b, #0
7677 ; CHECK-GI-NEXT: sshll v21.8h, v17.8b, #0
7678 ; CHECK-GI-NEXT: sshll2 v19.8h, v17.16b, #0
7679 ; CHECK-GI-NEXT: sshll2 v17.8h, v0.16b, #0
7680 ; CHECK-GI-NEXT: mul v16.4s, v16.4s, v9.4s
7681 ; CHECK-GI-NEXT: ldr q9, [sp, #16] // 16-byte Folded Reload
7682 ; CHECK-GI-NEXT: sshll v23.8h, v0.8b, #0
7683 ; CHECK-GI-NEXT: sshll2 v2.4s, v22.8h, #0
7684 ; CHECK-GI-NEXT: ushll2 v12.4s, v27.8h, #0
7685 ; CHECK-GI-NEXT: ushll v26.4s, v26.4h, #0
7686 ; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0
7687 ; CHECK-GI-NEXT: sshll2 v0.4s, v17.8h, #0
7688 ; CHECK-GI-NEXT: mul v7.4s, v9.4s, v7.4s
7689 ; CHECK-GI-NEXT: ldr q9, [sp] // 16-byte Folded Reload
7690 ; CHECK-GI-NEXT: sshll2 v5.4s, v19.8h, #0
7691 ; CHECK-GI-NEXT: sshll v17.4s, v17.4h, #0
7692 ; CHECK-GI-NEXT: sshll2 v3.4s, v20.8h, #0
7693 ; CHECK-GI-NEXT: mul v2.4s, v9.4s, v2.4s
7694 ; CHECK-GI-NEXT: ldr q9, [sp, #128] // 16-byte Folded Reload
7695 ; CHECK-GI-NEXT: ushll2 v15.4s, v8.8h, #0
7696 ; CHECK-GI-NEXT: mul v0.4s, v18.4s, v0.4s
7697 ; CHECK-GI-NEXT: ldr q18, [sp, #160] // 16-byte Folded Reload
7698 ; CHECK-GI-NEXT: ushll2 v11.4s, v29.8h, #0
7699 ; CHECK-GI-NEXT: sshll v9.4s, v9.4h, #0
7700 ; CHECK-GI-NEXT: ushll2 v13.4s, v30.8h, #0
7701 ; CHECK-GI-NEXT: sshll2 v1.4s, v24.8h, #0
7702 ; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0
7703 ; CHECK-GI-NEXT: sshll2 v4.4s, v21.8h, #0
7704 ; CHECK-GI-NEXT: sshll2 v6.4s, v23.8h, #0
7705 ; CHECK-GI-NEXT: mul v5.4s, v12.4s, v5.4s
7706 ; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0
7707 ; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0
7708 ; CHECK-GI-NEXT: mla v0.4s, v26.4s, v17.4s
7709 ; CHECK-GI-NEXT: mul v3.4s, v10.4s, v3.4s
7710 ; CHECK-GI-NEXT: mul v1.4s, v15.4s, v1.4s
7711 ; CHECK-GI-NEXT: mla v16.4s, v18.4s, v9.4s
7712 ; CHECK-GI-NEXT: ldp q18, q17, [sp, #192] // 32-byte Folded Reload
7713 ; CHECK-GI-NEXT: mul v4.4s, v11.4s, v4.4s
7714 ; CHECK-GI-NEXT: mul v6.4s, v13.4s, v6.4s
7715 ; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0
7716 ; CHECK-GI-NEXT: ldp d13, d12, [sp, #240] // 16-byte Folded Reload
7717 ; CHECK-GI-NEXT: sshll v20.4s, v20.4h, #0
7718 ; CHECK-GI-NEXT: ushll v10.4s, v14.4h, #0
7719 ; CHECK-GI-NEXT: ldp d15, d14, [sp, #224] // 16-byte Folded Reload
7720 ; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0
7721 ; CHECK-GI-NEXT: ushll v31.4s, v31.4h, #0
7722 ; CHECK-GI-NEXT: ushll v29.4s, v29.4h, #0
7723 ; CHECK-GI-NEXT: ushll v30.4s, v30.4h, #0
7724 ; CHECK-GI-NEXT: sshll v25.4s, v25.4h, #0
7725 ; CHECK-GI-NEXT: sshll v24.4s, v24.4h, #0
7726 ; CHECK-GI-NEXT: sshll v22.4s, v22.4h, #0
7727 ; CHECK-GI-NEXT: sshll v21.4s, v21.4h, #0
7728 ; CHECK-GI-NEXT: sshll v23.4s, v23.4h, #0
7729 ; CHECK-GI-NEXT: mla v5.4s, v27.4s, v19.4s
7730 ; CHECK-GI-NEXT: ldr q19, [sp, #144] // 16-byte Folded Reload
7731 ; CHECK-GI-NEXT: add v17.4s, v17.4s, v18.4s
7732 ; CHECK-GI-NEXT: ldr q18, [sp, #176] // 16-byte Folded Reload
7733 ; CHECK-GI-NEXT: mla v3.4s, v28.4s, v20.4s
7734 ; CHECK-GI-NEXT: mla v7.4s, v10.4s, v25.4s
7735 ; CHECK-GI-NEXT: ldp d11, d10, [sp, #256] // 16-byte Folded Reload
7736 ; CHECK-GI-NEXT: mla v1.4s, v8.4s, v24.4s
7737 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #272] // 16-byte Folded Reload
7738 ; CHECK-GI-NEXT: add v18.4s, v18.4s, v19.4s
7739 ; CHECK-GI-NEXT: ldp q20, q19, [sp, #64] // 32-byte Folded Reload
7740 ; CHECK-GI-NEXT: mla v2.4s, v31.4s, v22.4s
7741 ; CHECK-GI-NEXT: mla v4.4s, v29.4s, v21.4s
7742 ; CHECK-GI-NEXT: mla v6.4s, v30.4s, v23.4s
7743 ; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s
7744 ; CHECK-GI-NEXT: add v19.4s, v19.4s, v20.4s
7745 ; CHECK-GI-NEXT: ldr q20, [sp, #48] // 16-byte Folded Reload
7746 ; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s
7747 ; CHECK-GI-NEXT: add v16.4s, v20.4s, v16.4s
7748 ; CHECK-GI-NEXT: add v3.4s, v4.4s, v5.4s
7749 ; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s
7750 ; CHECK-GI-NEXT: add v4.4s, v17.4s, v18.4s
7751 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
7752 ; CHECK-GI-NEXT: add v5.4s, v19.4s, v16.4s
7753 ; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
7754 ; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s
7755 ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
7756 ; CHECK-GI-NEXT: addv s1, v2.4s
7757 ; CHECK-GI-NEXT: addv s0, v0.4s
7758 ; CHECK-GI-NEXT: fmov w8, s1
7759 ; CHECK-GI-NEXT: fmov w9, s0
7760 ; CHECK-GI-NEXT: add w0, w8, w9
7761 ; CHECK-GI-NEXT: add sp, sp, #304
7762 ; CHECK-GI-NEXT: ret
; IR under test: %az and %cz are zero extensions, while %bz and %dz are sign
; extensions despite the "z" suffix.
7764 %az = zext <64 x i8> %a to <64 x i32>
7765 %bz = sext <64 x i8> %b to <64 x i32>
7766 %m1 = mul nuw nsw <64 x i32> %az, %bz
7767 %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1)
7768 %cz = zext <64 x i8> %c to <64 x i32>
7769 %dz = sext <64 x i8> %d to <64 x i32>
7770 %m2 = mul nuw nsw <64 x i32> %cz, %dz
7771 %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2)
7772 %x = add i32 %r1, %r2