1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; Multiplies two <4 x i32> vectors lane-wise and sums the products with
; llvm.vector.reduce.add.v4i32. The expected output shows vmlav.u32 into r0.
4 define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
5 ; CHECK-LABEL: add_v4i32_v4i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: vmlav.u32 r0, q0, q1
10 %m = mul <4 x i32> %x, %y
11 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; Zero-extends both <4 x i32> operands to <4 x i64>, multiplies them and
; reduces with llvm.vector.reduce.add.v4i64. The expected output shows the
; unsigned long form vmlalv.u32 writing the r0/r1 pair.
15 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
16 ; CHECK-LABEL: add_v4i32_v4i64_zext:
17 ; CHECK: @ %bb.0: @ %entry
18 ; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
21 %xx = zext <4 x i32> %x to <4 x i64>
22 %yy = zext <4 x i32> %y to <4 x i64>
23 %m = mul <4 x i64> %xx, %yy
24 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Sign-extends both <4 x i32> operands to <4 x i64>, multiplies them and
; reduces with llvm.vector.reduce.add.v4i64. The expected output shows the
; signed long form vmlalv.s32 writing the r0/r1 pair.
28 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
29 ; CHECK-LABEL: add_v4i32_v4i64_sext:
30 ; CHECK: @ %bb.0: @ %entry
31 ; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
34 %xx = sext <4 x i32> %x to <4 x i64>
35 %yy = sext <4 x i32> %y to <4 x i64>
36 %m = mul <4 x i64> %xx, %yy
37 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Zero-extends two <2 x i32> operands to <2 x i64>, multiplies and reduces.
; The expected output shows vmullb.u32 into q2, after which the two 64-bit
; halves (d5 and d4) are moved to core registers and summed with adds/adcs.
41 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
42 ; CHECK-LABEL: add_v2i32_v2i64_zext:
43 ; CHECK: @ %bb.0: @ %entry
44 ; CHECK-NEXT: vmullb.u32 q2, q0, q1
45 ; CHECK-NEXT: vmov r0, r1, d5
46 ; CHECK-NEXT: vmov r2, r3, d4
47 ; CHECK-NEXT: adds r0, r0, r2
48 ; CHECK-NEXT: adcs r1, r3
51 %xx = zext <2 x i32> %x to <2 x i64>
52 %yy = zext <2 x i32> %y to <2 x i64>
53 %m = mul <2 x i64> %xx, %yy
54 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Sign-extends two <2 x i32> operands to <2 x i64>, multiplies and reduces.
; The expected output shows vmullb.s32 into q2, after which the two 64-bit
; halves (d5 and d4) are moved to core registers and summed with adds/adcs.
58 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
59 ; CHECK-LABEL: add_v2i32_v2i64_sext:
60 ; CHECK: @ %bb.0: @ %entry
61 ; CHECK-NEXT: vmullb.s32 q2, q0, q1
62 ; CHECK-NEXT: vmov r0, r1, d5
63 ; CHECK-NEXT: vmov r2, r3, d4
64 ; CHECK-NEXT: adds r0, r0, r2
65 ; CHECK-NEXT: adcs r1, r3
68 %xx = sext <2 x i32> %x to <2 x i64>
69 %yy = sext <2 x i32> %y to <2 x i64>
70 %m = mul <2 x i64> %xx, %yy
71 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Zero-extends two <8 x i16> operands to <8 x i32>, multiplies and reduces
; with llvm.vector.reduce.add.v8i32. The expected output shows vmlav.u16.
75 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
76 ; CHECK-LABEL: add_v8i16_v8i32_zext:
77 ; CHECK: @ %bb.0: @ %entry
78 ; CHECK-NEXT: vmlav.u16 r0, q0, q1
81 %xx = zext <8 x i16> %x to <8 x i32>
82 %yy = zext <8 x i16> %y to <8 x i32>
83 %m = mul <8 x i32> %xx, %yy
84 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
; Sign-extends two <8 x i16> operands to <8 x i32>, multiplies and reduces
; with llvm.vector.reduce.add.v8i32. The expected output shows vmlav.s16.
88 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
89 ; CHECK-LABEL: add_v8i16_v8i32_sext:
90 ; CHECK: @ %bb.0: @ %entry
91 ; CHECK-NEXT: vmlav.s16 r0, q0, q1
94 %xx = sext <8 x i16> %x to <8 x i32>
95 %yy = sext <8 x i16> %y to <8 x i32>
96 %m = mul <8 x i32> %xx, %yy
97 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
; Zero-extends two <4 x i16> operands to <4 x i32>, multiplies and reduces.
; The expected output widens both inputs in place with vmovlb.u16 and then
; uses vmlav.u32.
101 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
102 ; CHECK-LABEL: add_v4i16_v4i32_zext:
103 ; CHECK: @ %bb.0: @ %entry
104 ; CHECK-NEXT: vmovlb.u16 q1, q1
105 ; CHECK-NEXT: vmovlb.u16 q0, q0
106 ; CHECK-NEXT: vmlav.u32 r0, q0, q1
109 %xx = zext <4 x i16> %x to <4 x i32>
110 %yy = zext <4 x i16> %y to <4 x i32>
111 %m = mul <4 x i32> %xx, %yy
112 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; Sign-extends two <4 x i16> operands to <4 x i32>, multiplies and reduces.
; The expected output widens both inputs with vmovlb.s16; the reduction
; itself is listed as vmlav.u32 (the signedness shows up only in the widening).
116 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
117 ; CHECK-LABEL: add_v4i16_v4i32_sext:
118 ; CHECK: @ %bb.0: @ %entry
119 ; CHECK-NEXT: vmovlb.s16 q1, q1
120 ; CHECK-NEXT: vmovlb.s16 q0, q0
121 ; CHECK-NEXT: vmlav.u32 r0, q0, q1
124 %xx = sext <4 x i16> %x to <4 x i32>
125 %yy = sext <4 x i16> %y to <4 x i32>
126 %m = mul <4 x i32> %xx, %yy
127 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; Multiplies two <8 x i16> vectors and reduces to an i16 that is returned
; zeroext. The expected output shows vmlav.u16 followed by uxth on r0.
131 define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
132 ; CHECK-LABEL: add_v8i16_v8i16:
133 ; CHECK: @ %bb.0: @ %entry
134 ; CHECK-NEXT: vmlav.u16 r0, q0, q1
135 ; CHECK-NEXT: uxth r0, r0
138 %m = mul <8 x i16> %x, %y
139 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
; Zero-extends two <8 x i16> operands directly to <8 x i64>, multiplies and
; reduces with llvm.vector.reduce.add.v8i64. The expected output shows
; vmlalv.u16 writing the r0/r1 pair.
143 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
144 ; CHECK-LABEL: add_v8i16_v8i64_zext:
145 ; CHECK: @ %bb.0: @ %entry
146 ; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
149 %xx = zext <8 x i16> %x to <8 x i64>
150 %yy = zext <8 x i16> %y to <8 x i64>
151 %m = mul <8 x i64> %xx, %yy
152 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
; Sign-extends two <8 x i16> operands directly to <8 x i64>, multiplies and
; reduces with llvm.vector.reduce.add.v8i64. The expected output shows
; vmlalv.s16 writing the r0/r1 pair.
156 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
157 ; CHECK-LABEL: add_v8i16_v8i64_sext:
158 ; CHECK: @ %bb.0: @ %entry
159 ; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
162 %xx = sext <8 x i16> %x to <8 x i64>
163 %yy = sext <8 x i16> %y to <8 x i64>
164 %m = mul <8 x i64> %xx, %yy
165 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
; Mixed-width operands: %x is <8 x i16> and %y is <8 x i8>, both zero-extended
; to <8 x i64> before the multiply and reduction. The expected output widens
; only q1 with vmovlb.u8 and then uses vmlalv.u16.
169 define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_zext(<8 x i16> %x, <8 x i8> %y) {
170 ; CHECK-LABEL: add_v8i8i16_v8i64_zext:
171 ; CHECK: @ %bb.0: @ %entry
172 ; CHECK-NEXT: vmovlb.u8 q1, q1
173 ; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
176 %xx = zext <8 x i16> %x to <8 x i64>
177 %yy = zext <8 x i8> %y to <8 x i64>
178 %m = mul <8 x i64> %xx, %yy
179 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
; Mixed-width operands: %x is <8 x i16> and %y is <8 x i8>, both sign-extended
; to <8 x i64> before the multiply and reduction. The expected output widens
; only q1 with vmovlb.s8 and then uses vmlalv.s16.
183 define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y) {
184 ; CHECK-LABEL: add_v8i8i16_v8i64_sext:
185 ; CHECK: @ %bb.0: @ %entry
186 ; CHECK-NEXT: vmovlb.s8 q1, q1
187 ; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
190 %xx = sext <8 x i16> %x to <8 x i64>
191 %yy = sext <8 x i8> %y to <8 x i64>
192 %m = mul <8 x i64> %xx, %yy
193 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
; Zero-extends two <4 x i16> operands to <4 x i64>, multiplies and reduces.
; The expected output widens both inputs with vmovlb.u16 and then uses
; vmlalv.u32.
197 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
198 ; CHECK-LABEL: add_v4i16_v4i64_zext:
199 ; CHECK: @ %bb.0: @ %entry
200 ; CHECK-NEXT: vmovlb.u16 q1, q1
201 ; CHECK-NEXT: vmovlb.u16 q0, q0
202 ; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
205 %xx = zext <4 x i16> %x to <4 x i64>
206 %yy = zext <4 x i16> %y to <4 x i64>
207 %m = mul <4 x i64> %xx, %yy
208 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Sign-extends two <4 x i16> operands to <4 x i64>, multiplies and reduces.
; The expected output widens both inputs with vmovlb.s16 and then uses
; vmlalv.s32.
212 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
213 ; CHECK-LABEL: add_v4i16_v4i64_sext:
214 ; CHECK: @ %bb.0: @ %entry
215 ; CHECK-NEXT: vmovlb.s16 q1, q1
216 ; CHECK-NEXT: vmovlb.s16 q0, q0
217 ; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
220 %xx = sext <4 x i16> %x to <4 x i64>
221 %yy = sext <4 x i16> %y to <4 x i64>
222 %m = mul <4 x i64> %xx, %yy
223 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Two-step extension: the <8 x i16> operands are zero-extended to <8 x i32>
; and multiplied there, then the product is zero-extended to <8 x i64> before
; the reduction. The expected output still shows vmlalv.u16.
227 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
228 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
229 ; CHECK: @ %bb.0: @ %entry
230 ; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
233 %xx = zext <8 x i16> %x to <8 x i32>
234 %yy = zext <8 x i16> %y to <8 x i32>
235 %m = mul <8 x i32> %xx, %yy
236 %ma = zext <8 x i32> %m to <8 x i64>
237 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
; Two-step extension: the <8 x i16> operands are sign-extended to <8 x i32>
; and multiplied there, then the product is sign-extended to <8 x i64> before
; the reduction. The expected output still shows vmlalv.s16.
241 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
242 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
243 ; CHECK: @ %bb.0: @ %entry
244 ; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
247 %xx = sext <8 x i16> %x to <8 x i32>
248 %yy = sext <8 x i16> %y to <8 x i32>
249 %m = mul <8 x i32> %xx, %yy
250 %ma = sext <8 x i32> %m to <8 x i64>
251 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
; Squares the sign-extended %x (the multiply uses %xx for both operands; %y
; is not referenced in the lines shown), zero-extends the <8 x i32> product
; to <8 x i64> and reduces. The expected output shows vmlalv.s16 with q0 as
; both vector operands.
255 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y) {
256 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
257 ; CHECK: @ %bb.0: @ %entry
258 ; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q0
261 %xx = sext <8 x i16> %x to <8 x i32>
262 %m = mul <8 x i32> %xx, %xx
263 %ma = zext <8 x i32> %m to <8 x i64>
264 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
; Zero-extends two <2 x i16> operands to <2 x i64>, multiplies and reduces.
; The expected output masks both inputs with a 0xffff per-64-bit-lane constant,
; moves the lanes (s6/s2 and s4/s0) to core registers and combines them with
; umull followed by umlal.
268 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
269 ; CHECK-LABEL: add_v2i16_v2i64_zext:
270 ; CHECK: @ %bb.0: @ %entry
271 ; CHECK-NEXT: vmov.i64 q2, #0xffff
272 ; CHECK-NEXT: vand q1, q1, q2
273 ; CHECK-NEXT: vand q0, q0, q2
274 ; CHECK-NEXT: vmov r0, s6
275 ; CHECK-NEXT: vmov r1, s2
276 ; CHECK-NEXT: vmov r2, s4
277 ; CHECK-NEXT: vmov r3, s0
278 ; CHECK-NEXT: umull r0, r1, r1, r0
279 ; CHECK-NEXT: umlal r0, r1, r3, r2
282 %xx = zext <2 x i16> %x to <2 x i64>
283 %yy = zext <2 x i16> %y to <2 x i64>
284 %m = mul <2 x i64> %xx, %yy
285 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Sign-extends two <2 x i16> operands to <2 x i64>, multiplies and reduces.
; The expected output moves the lanes (s6/s2 and s4/s0) to core registers,
; sign-extends each with sxth and combines them with smull followed by smlal.
289 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
290 ; CHECK-LABEL: add_v2i16_v2i64_sext:
291 ; CHECK: @ %bb.0: @ %entry
292 ; CHECK-NEXT: vmov r0, s6
293 ; CHECK-NEXT: vmov r1, s2
294 ; CHECK-NEXT: vmov r2, s4
295 ; CHECK-NEXT: vmov r3, s0
296 ; CHECK-NEXT: sxth r0, r0
297 ; CHECK-NEXT: sxth r1, r1
298 ; CHECK-NEXT: smull r0, r1, r1, r0
299 ; CHECK-NEXT: sxth r2, r2
300 ; CHECK-NEXT: sxth r3, r3
301 ; CHECK-NEXT: smlal r0, r1, r3, r2
304 %xx = sext <2 x i16> %x to <2 x i64>
305 %yy = sext <2 x i16> %y to <2 x i64>
306 %m = mul <2 x i64> %xx, %yy
307 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Zero-extends two <16 x i8> operands to <16 x i32>, multiplies and reduces
; with llvm.vector.reduce.add.v16i32. The expected output shows vmlav.u8.
311 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
312 ; CHECK-LABEL: add_v16i8_v16i32_zext:
313 ; CHECK: @ %bb.0: @ %entry
314 ; CHECK-NEXT: vmlav.u8 r0, q0, q1
317 %xx = zext <16 x i8> %x to <16 x i32>
318 %yy = zext <16 x i8> %y to <16 x i32>
319 %m = mul <16 x i32> %xx, %yy
320 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
; Sign-extends two <16 x i8> operands to <16 x i32>, multiplies and reduces
; with llvm.vector.reduce.add.v16i32. The expected output shows vmlav.s8.
324 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
325 ; CHECK-LABEL: add_v16i8_v16i32_sext:
326 ; CHECK: @ %bb.0: @ %entry
327 ; CHECK-NEXT: vmlav.s8 r0, q0, q1
330 %xx = sext <16 x i8> %x to <16 x i32>
331 %yy = sext <16 x i8> %y to <16 x i32>
332 %m = mul <16 x i32> %xx, %yy
333 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
; Zero-extends two <8 x i8> operands to <8 x i32>, multiplies and reduces.
; The expected output widens both inputs with vmovlb.u8 and then uses
; vmlav.u16.
337 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
338 ; CHECK-LABEL: add_v8i8_v8i32_zext:
339 ; CHECK: @ %bb.0: @ %entry
340 ; CHECK-NEXT: vmovlb.u8 q1, q1
341 ; CHECK-NEXT: vmovlb.u8 q0, q0
342 ; CHECK-NEXT: vmlav.u16 r0, q0, q1
345 %xx = zext <8 x i8> %x to <8 x i32>
346 %yy = zext <8 x i8> %y to <8 x i32>
347 %m = mul <8 x i32> %xx, %yy
348 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
; Sign-extends two <8 x i8> operands to <8 x i32>, multiplies and reduces.
; The expected output widens both inputs with vmovlb.s8 and then uses
; vmlav.s16.
352 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
353 ; CHECK-LABEL: add_v8i8_v8i32_sext:
354 ; CHECK: @ %bb.0: @ %entry
355 ; CHECK-NEXT: vmovlb.s8 q1, q1
356 ; CHECK-NEXT: vmovlb.s8 q0, q0
357 ; CHECK-NEXT: vmlav.s16 r0, q0, q1
360 %xx = sext <8 x i8> %x to <8 x i32>
361 %yy = sext <8 x i8> %y to <8 x i32>
362 %m = mul <8 x i32> %xx, %yy
363 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
; Mixed-width operands: %x is <8 x i8> and %y is <8 x i16>, both zero-extended
; to <8 x i32> before the multiply and reduction. The expected output widens
; only q0 with vmovlb.u8 and then uses vmlav.u16.
367 define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_zext(<8 x i8> %x, <8 x i16> %y) {
368 ; CHECK-LABEL: add_v8i8i16_v8i32_zext:
369 ; CHECK: @ %bb.0: @ %entry
370 ; CHECK-NEXT: vmovlb.u8 q0, q0
371 ; CHECK-NEXT: vmlav.u16 r0, q0, q1
374 %xx = zext <8 x i8> %x to <8 x i32>
375 %yy = zext <8 x i16> %y to <8 x i32>
376 %m = mul <8 x i32> %xx, %yy
377 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
; Mixed-width operands: %x is <8 x i8> and %y is <8 x i16>, both sign-extended
; to <8 x i32> before the multiply and reduction. The expected output widens
; only q0 with vmovlb.s8 and then uses vmlav.s16.
381 define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y) {
382 ; CHECK-LABEL: add_v8i8i16_v8i32_sext:
383 ; CHECK: @ %bb.0: @ %entry
384 ; CHECK-NEXT: vmovlb.s8 q0, q0
385 ; CHECK-NEXT: vmlav.s16 r0, q0, q1
388 %xx = sext <8 x i8> %x to <8 x i32>
389 %yy = sext <8 x i16> %y to <8 x i32>
390 %m = mul <8 x i32> %xx, %yy
391 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
; Two-step extension: the <16 x i8> operands are zero-extended to <16 x i16>
; and multiplied there, then the product is zero-extended to <16 x i32> before
; the reduction. The expected output shows vmlav.u8.
395 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
396 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
397 ; CHECK: @ %bb.0: @ %entry
398 ; CHECK-NEXT: vmlav.u8 r0, q0, q1
401 %xx = zext <16 x i8> %x to <16 x i16>
402 %yy = zext <16 x i8> %y to <16 x i16>
403 %m = mul <16 x i16> %xx, %yy
404 %ma = zext <16 x i16> %m to <16 x i32>
405 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
; Two-step extension: the <16 x i8> operands are sign-extended to <16 x i16>
; and multiplied there, then the product is sign-extended to <16 x i32> before
; the reduction. The expected output shows vmlav.s8.
409 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
410 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
411 ; CHECK: @ %bb.0: @ %entry
412 ; CHECK-NEXT: vmlav.s8 r0, q0, q1
415 %xx = sext <16 x i8> %x to <16 x i16>
416 %yy = sext <16 x i8> %y to <16 x i16>
417 %m = mul <16 x i16> %xx, %yy
418 %ma = sext <16 x i16> %m to <16 x i32>
419 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
; Squares the sign-extended %x (the multiply uses %xx for both operands; %y
; is not referenced in the lines shown), zero-extends the <16 x i16> product
; to <16 x i32> and reduces. The expected output shows vmlav.s8 with q0 as
; both vector operands.
423 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y) {
424 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
425 ; CHECK: @ %bb.0: @ %entry
426 ; CHECK-NEXT: vmlav.s8 r0, q0, q0
429 %xx = sext <16 x i8> %x to <16 x i16>
430 %m = mul <16 x i16> %xx, %xx
431 %ma = zext <16 x i16> %m to <16 x i32>
432 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
; Zero-extends two <4 x i8> operands to <4 x i32>, multiplies and reduces.
; The expected output masks both inputs with a 0xff per-32-bit-lane constant
; (vmov.i32 + vand) and then uses vmlav.u32.
436 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
437 ; CHECK-LABEL: add_v4i8_v4i32_zext:
438 ; CHECK: @ %bb.0: @ %entry
439 ; CHECK-NEXT: vmov.i32 q2, #0xff
440 ; CHECK-NEXT: vand q1, q1, q2
441 ; CHECK-NEXT: vand q0, q0, q2
442 ; CHECK-NEXT: vmlav.u32 r0, q0, q1
445 %xx = zext <4 x i8> %x to <4 x i32>
446 %yy = zext <4 x i8> %y to <4 x i32>
447 %m = mul <4 x i32> %xx, %yy
448 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; Sign-extends two <4 x i8> operands to <4 x i32>, multiplies and reduces.
; The expected output widens each input twice (vmovlb.s8 then vmovlb.s16)
; and then uses vmlav.u32.
452 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
453 ; CHECK-LABEL: add_v4i8_v4i32_sext:
454 ; CHECK: @ %bb.0: @ %entry
455 ; CHECK-NEXT: vmovlb.s8 q1, q1
456 ; CHECK-NEXT: vmovlb.s8 q0, q0
457 ; CHECK-NEXT: vmovlb.s16 q1, q1
458 ; CHECK-NEXT: vmovlb.s16 q0, q0
459 ; CHECK-NEXT: vmlav.u32 r0, q0, q1
462 %xx = sext <4 x i8> %x to <4 x i32>
463 %yy = sext <4 x i8> %y to <4 x i32>
464 %m = mul <4 x i32> %xx, %yy
465 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; Mixed signedness: %x is sign-extended and %y is zero-extended from <4 x i8>
; to <4 x i32> before the multiply and reduction. The expected output widens
; q0 with vmovlb.s8/vmovlb.s16, masks q1 with 0xff, and then uses vmlav.u32.
469 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_szext(<4 x i8> %x, <4 x i8> %y) {
470 ; CHECK-LABEL: add_v4i8_v4i32_szext:
471 ; CHECK: @ %bb.0: @ %entry
472 ; CHECK-NEXT: vmovlb.s8 q0, q0
473 ; CHECK-NEXT: vmov.i32 q2, #0xff
474 ; CHECK-NEXT: vand q1, q1, q2
475 ; CHECK-NEXT: vmovlb.s16 q0, q0
476 ; CHECK-NEXT: vmlav.u32 r0, q0, q1
479 %xx = sext <4 x i8> %x to <4 x i32>
480 %yy = zext <4 x i8> %y to <4 x i32>
481 %m = mul <4 x i32> %xx, %yy
482 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; Zero-extends two <16 x i8> operands to <16 x i16>, multiplies and reduces
; to an i16 returned zeroext. The expected output shows vmlav.u8 followed by
; uxth on r0.
486 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
487 ; CHECK-LABEL: add_v16i8_v16i16_zext:
488 ; CHECK: @ %bb.0: @ %entry
489 ; CHECK-NEXT: vmlav.u8 r0, q0, q1
490 ; CHECK-NEXT: uxth r0, r0
493 %xx = zext <16 x i8> %x to <16 x i16>
494 %yy = zext <16 x i8> %y to <16 x i16>
495 %m = mul <16 x i16> %xx, %yy
496 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
; Sign-extends two <16 x i8> operands to <16 x i16>, multiplies and reduces
; to an i16 returned signext. The expected output shows vmlav.s8 followed by
; sxth on r0.
500 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
501 ; CHECK-LABEL: add_v16i8_v16i16_sext:
502 ; CHECK: @ %bb.0: @ %entry
503 ; CHECK-NEXT: vmlav.s8 r0, q0, q1
504 ; CHECK-NEXT: sxth r0, r0
507 %xx = sext <16 x i8> %x to <16 x i16>
508 %yy = sext <16 x i8> %y to <16 x i16>
509 %m = mul <16 x i16> %xx, %yy
510 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
; Mixed signedness: %x is sign-extended and %y is zero-extended from
; <16 x i8> to <16 x i16> before the multiply and reduction to a signext i16.
; The expected output has no single-instruction form: both vectors are stored
; to a 32-byte stack area, each 8-byte half is reloaded widened (vldrb.s16 for
; the %x copy, vldrb.u16 for the %y copy), and the halves are combined with
; vmlav.u16 followed by vmlava.u16 before the final sxth.
514 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y) {
515 ; CHECK-LABEL: add_v16i8_v16i16_szext:
516 ; CHECK: @ %bb.0: @ %entry
517 ; CHECK-NEXT: .pad #32
518 ; CHECK-NEXT: sub sp, #32
519 ; CHECK-NEXT: add r0, sp, #16
520 ; CHECK-NEXT: mov r1, sp
521 ; CHECK-NEXT: vstrw.32 q1, [r0]
522 ; CHECK-NEXT: vstrw.32 q0, [r1]
523 ; CHECK-NEXT: vldrb.u16 q0, [r0, #8]
524 ; CHECK-NEXT: vldrb.s16 q1, [r1, #8]
525 ; CHECK-NEXT: vmlav.u16 r2, q1, q0
526 ; CHECK-NEXT: vldrb.u16 q0, [r0]
527 ; CHECK-NEXT: vldrb.s16 q1, [r1]
528 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
529 ; CHECK-NEXT: sxth r0, r2
530 ; CHECK-NEXT: add sp, #32
533 %xx = sext <16 x i8> %x to <16 x i16>
534 %yy = zext <16 x i8> %y to <16 x i16>
535 %m = mul <16 x i16> %xx, %yy
536 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
; Zero-extends two <8 x i8> operands to <8 x i16>, multiplies and reduces to
; an i16 returned zeroext. The expected output widens both inputs with
; vmovlb.u8, uses vmlav.u16 and finishes with uxth.
540 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
541 ; CHECK-LABEL: add_v8i8_v8i16_zext:
542 ; CHECK: @ %bb.0: @ %entry
543 ; CHECK-NEXT: vmovlb.u8 q1, q1
544 ; CHECK-NEXT: vmovlb.u8 q0, q0
545 ; CHECK-NEXT: vmlav.u16 r0, q0, q1
546 ; CHECK-NEXT: uxth r0, r0
549 %xx = zext <8 x i8> %x to <8 x i16>
550 %yy = zext <8 x i8> %y to <8 x i16>
551 %m = mul <8 x i16> %xx, %yy
552 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
; Sign-extends two <8 x i8> operands to <8 x i16>, multiplies and reduces to
; an i16 returned signext. The expected output widens both inputs with
; vmovlb.s8, uses vmlav.u16 and finishes with sxth.
556 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
557 ; CHECK-LABEL: add_v8i8_v8i16_sext:
558 ; CHECK: @ %bb.0: @ %entry
559 ; CHECK-NEXT: vmovlb.s8 q1, q1
560 ; CHECK-NEXT: vmovlb.s8 q0, q0
561 ; CHECK-NEXT: vmlav.u16 r0, q0, q1
562 ; CHECK-NEXT: sxth r0, r0
565 %xx = sext <8 x i8> %x to <8 x i16>
566 %yy = sext <8 x i8> %y to <8 x i16>
567 %m = mul <8 x i16> %xx, %yy
568 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
; Multiplies two <16 x i8> vectors and reduces to an i8 returned zeroext.
; The expected output shows vmlav.u8 followed by uxtb on r0.
572 define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
573 ; CHECK-LABEL: add_v16i8_v16i8:
574 ; CHECK: @ %bb.0: @ %entry
575 ; CHECK-NEXT: vmlav.u8 r0, q0, q1
576 ; CHECK-NEXT: uxtb r0, r0
579 %m = mul <16 x i8> %x, %y
580 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m)
; Zero-extends two <16 x i8> operands to <16 x i64>, multiplies and reduces.
; The expected output stores both vectors to a 32-byte stack area, reloads
; each 8-byte half widened with vldrb.u16, and combines the halves with
; vmlalv.u16 followed by the accumulating vmlalva.u16 into r0/r1.
584 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
585 ; CHECK-LABEL: add_v16i8_v16i64_zext:
586 ; CHECK: @ %bb.0: @ %entry
587 ; CHECK-NEXT: .pad #32
588 ; CHECK-NEXT: sub sp, #32
589 ; CHECK-NEXT: add r2, sp, #16
590 ; CHECK-NEXT: mov r3, sp
591 ; CHECK-NEXT: vstrw.32 q1, [r2]
592 ; CHECK-NEXT: vstrw.32 q0, [r3]
593 ; CHECK-NEXT: vldrb.u16 q0, [r2]
594 ; CHECK-NEXT: vldrb.u16 q1, [r3]
595 ; CHECK-NEXT: vmlalv.u16 r0, r1, q1, q0
596 ; CHECK-NEXT: vldrb.u16 q0, [r2, #8]
597 ; CHECK-NEXT: vldrb.u16 q1, [r3, #8]
598 ; CHECK-NEXT: vmlalva.u16 r0, r1, q1, q0
599 ; CHECK-NEXT: add sp, #32
602 %xx = zext <16 x i8> %x to <16 x i64>
603 %yy = zext <16 x i8> %y to <16 x i64>
604 %m = mul <16 x i64> %xx, %yy
605 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
; Sign-extends two <16 x i8> operands to <16 x i64>, multiplies and reduces.
; The expected output stores both vectors to a 32-byte stack area, reloads
; each 8-byte half widened with vldrb.s16, and combines the halves with
; vmlalv.s16 followed by the accumulating vmlalva.s16 into r0/r1.
609 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
610 ; CHECK-LABEL: add_v16i8_v16i64_sext:
611 ; CHECK: @ %bb.0: @ %entry
612 ; CHECK-NEXT: .pad #32
613 ; CHECK-NEXT: sub sp, #32
614 ; CHECK-NEXT: add r2, sp, #16
615 ; CHECK-NEXT: mov r3, sp
616 ; CHECK-NEXT: vstrw.32 q1, [r2]
617 ; CHECK-NEXT: vstrw.32 q0, [r3]
618 ; CHECK-NEXT: vldrb.s16 q0, [r2]
619 ; CHECK-NEXT: vldrb.s16 q1, [r3]
620 ; CHECK-NEXT: vmlalv.s16 r0, r1, q1, q0
621 ; CHECK-NEXT: vldrb.s16 q0, [r2, #8]
622 ; CHECK-NEXT: vldrb.s16 q1, [r3, #8]
623 ; CHECK-NEXT: vmlalva.s16 r0, r1, q1, q0
624 ; CHECK-NEXT: add sp, #32
627 %xx = sext <16 x i8> %x to <16 x i64>
628 %yy = sext <16 x i8> %y to <16 x i64>
629 %m = mul <16 x i64> %xx, %yy
630 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
; Same zero-extending <16 x i8> -> <16 x i64> multiply-reduce, but with both
; operands loaded from the pointers %xp and %yp. The expected output loads
; each 8-byte half directly with widening vldrb.u16 (offsets 0 and #8 from
; r0/r1), accumulates with vmlalv.u16 then vmlalva.u16 into r2/r3, and copies
; the result to r0/r1.
634 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext_load(<16 x i8> *%xp, <16 x i8> *%yp) {
635 ; CHECK-LABEL: add_v16i8_v16i64_zext_load:
636 ; CHECK: @ %bb.0: @ %entry
637 ; CHECK-NEXT: vldrb.u16 q0, [r1]
638 ; CHECK-NEXT: vldrb.u16 q1, [r0]
639 ; CHECK-NEXT: vmlalv.u16 r2, r3, q1, q0
640 ; CHECK-NEXT: vldrb.u16 q0, [r1, #8]
641 ; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
642 ; CHECK-NEXT: vmlalva.u16 r2, r3, q1, q0
643 ; CHECK-NEXT: mov r0, r2
644 ; CHECK-NEXT: mov r1, r3
647 %x = load <16 x i8>, <16 x i8>* %xp
648 %y = load <16 x i8>, <16 x i8>* %yp
649 %xx = zext <16 x i8> %x to <16 x i64>
650 %yy = zext <16 x i8> %y to <16 x i64>
651 %m = mul <16 x i64> %xx, %yy
652 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
; Same sign-extending <16 x i8> -> <16 x i64> multiply-reduce, but with both
; operands loaded from the pointers %xp and %yp. The expected output loads
; each 8-byte half directly with widening vldrb.s16 (offsets 0 and #8 from
; r0/r1), accumulates with vmlalv.s16 then vmlalva.s16 into r2/r3, and copies
; the result to r0/r1.
656 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext_load(<16 x i8> *%xp, <16 x i8> *%yp) {
657 ; CHECK-LABEL: add_v16i8_v16i64_sext_load:
658 ; CHECK: @ %bb.0: @ %entry
659 ; CHECK-NEXT: vldrb.s16 q0, [r1]
660 ; CHECK-NEXT: vldrb.s16 q1, [r0]
661 ; CHECK-NEXT: vmlalv.s16 r2, r3, q1, q0
662 ; CHECK-NEXT: vldrb.s16 q0, [r1, #8]
663 ; CHECK-NEXT: vldrb.s16 q1, [r0, #8]
664 ; CHECK-NEXT: vmlalva.s16 r2, r3, q1, q0
665 ; CHECK-NEXT: mov r0, r2
666 ; CHECK-NEXT: mov r1, r3
669 %x = load <16 x i8>, <16 x i8>* %xp
670 %y = load <16 x i8>, <16 x i8>* %yp
671 %xx = sext <16 x i8> %x to <16 x i64>
672 %yy = sext <16 x i8> %y to <16 x i64>
673 %m = mul <16 x i64> %xx, %yy
674 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
; Zero-extends two <8 x i8> operands to <8 x i64>, multiplies and reduces.
; The expected output widens both inputs with vmovlb.u8 and then uses
; vmlalv.u16.
678 define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
679 ; CHECK-LABEL: add_v8i8_v8i64_zext:
680 ; CHECK: @ %bb.0: @ %entry
681 ; CHECK-NEXT: vmovlb.u8 q1, q1
682 ; CHECK-NEXT: vmovlb.u8 q0, q0
683 ; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
686 %xx = zext <8 x i8> %x to <8 x i64>
687 %yy = zext <8 x i8> %y to <8 x i64>
688 %m = mul <8 x i64> %xx, %yy
689 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
; Sign-extends two <8 x i8> operands to <8 x i64>, multiplies and reduces.
; The expected output widens both inputs with vmovlb.s8 and then uses
; vmlalv.s16.
693 define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
694 ; CHECK-LABEL: add_v8i8_v8i64_sext:
695 ; CHECK: @ %bb.0: @ %entry
696 ; CHECK-NEXT: vmovlb.s8 q1, q1
697 ; CHECK-NEXT: vmovlb.s8 q0, q0
698 ; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
701 %xx = sext <8 x i8> %x to <8 x i64>
702 %yy = sext <8 x i8> %y to <8 x i64>
703 %m = mul <8 x i64> %xx, %yy
704 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
; Zero-extends two <4 x i8> operands to <4 x i64>, multiplies and reduces.
; The expected output masks both inputs with a 0xff per-32-bit-lane constant
; and then uses vmlalv.u32.
708 define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
709 ; CHECK-LABEL: add_v4i8_v4i64_zext:
710 ; CHECK: @ %bb.0: @ %entry
711 ; CHECK-NEXT: vmov.i32 q2, #0xff
712 ; CHECK-NEXT: vand q1, q1, q2
713 ; CHECK-NEXT: vand q0, q0, q2
714 ; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
717 %xx = zext <4 x i8> %x to <4 x i64>
718 %yy = zext <4 x i8> %y to <4 x i64>
719 %m = mul <4 x i64> %xx, %yy
720 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Sign-extends two <4 x i8> operands to <4 x i64>, multiplies and reduces.
; The expected output widens each input twice (vmovlb.s8 then vmovlb.s16)
; and then uses vmlalv.s32.
724 define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
725 ; CHECK-LABEL: add_v4i8_v4i64_sext:
726 ; CHECK: @ %bb.0: @ %entry
727 ; CHECK-NEXT: vmovlb.s8 q1, q1
728 ; CHECK-NEXT: vmovlb.s8 q0, q0
729 ; CHECK-NEXT: vmovlb.s16 q1, q1
730 ; CHECK-NEXT: vmovlb.s16 q0, q0
731 ; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
734 %xx = sext <4 x i8> %x to <4 x i64>
735 %yy = sext <4 x i8> %y to <4 x i64>
736 %m = mul <4 x i64> %xx, %yy
737 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Mixed-width operands: %x is <4 x i8> and %y is <4 x i16>, both zero-extended
; to <4 x i64> before the multiply and reduction. The expected output masks
; q0 with 0xff, widens q1 with vmovlb.u16, and then uses vmlalv.u32.
741 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_zext(<4 x i8> %x, <4 x i16> %y) {
742 ; CHECK-LABEL: add_v4i8i16_v4i64_zext:
743 ; CHECK: @ %bb.0: @ %entry
744 ; CHECK-NEXT: vmov.i32 q2, #0xff
745 ; CHECK-NEXT: vmovlb.u16 q1, q1
746 ; CHECK-NEXT: vand q0, q0, q2
747 ; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
750 %xx = zext <4 x i8> %x to <4 x i64>
751 %yy = zext <4 x i16> %y to <4 x i64>
752 %m = mul <4 x i64> %xx, %yy
753 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Mixed-width operands: %x is <4 x i8> and %y is <4 x i16>, both sign-extended
; to <4 x i64> before the multiply and reduction. The expected output widens
; q0 with vmovlb.s8 then vmovlb.s16, widens q1 with vmovlb.s16, and then uses
; vmlalv.s32.
757 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y) {
758 ; CHECK-LABEL: add_v4i8i16_v4i64_sext:
759 ; CHECK: @ %bb.0: @ %entry
760 ; CHECK-NEXT: vmovlb.s8 q0, q0
761 ; CHECK-NEXT: vmovlb.s16 q1, q1
762 ; CHECK-NEXT: vmovlb.s16 q0, q0
763 ; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
766 %xx = sext <4 x i8> %x to <4 x i64>
767 %yy = sext <4 x i16> %y to <4 x i64>
768 %m = mul <4 x i64> %xx, %yy
769 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Mixed-width, two-step extension: %x (<4 x i8>) and %y (<4 x i16>) are
; zero-extended to <4 x i32> and multiplied there, then the product is
; zero-extended to <4 x i64> before the reduction. The expected output masks
; q0 with 0xff, widens q1 with vmovlb.u16, and then uses vmlalv.u32.
773 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_zext(<4 x i8> %x, <4 x i16> %y) {
774 ; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_zext:
775 ; CHECK: @ %bb.0: @ %entry
776 ; CHECK-NEXT: vmov.i32 q2, #0xff
777 ; CHECK-NEXT: vmovlb.u16 q1, q1
778 ; CHECK-NEXT: vand q0, q0, q2
779 ; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
782 %xx = zext <4 x i8> %x to <4 x i32>
783 %yy = zext <4 x i16> %y to <4 x i32>
784 %mm = mul <4 x i32> %xx, %yy
785 %m = zext <4 x i32> %mm to <4 x i64>
786 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Mixed-width, two-step extension: %x (<4 x i8>) and %y (<4 x i16>) are
; sign-extended to <4 x i32> and multiplied there, then the product is
; sign-extended to <4 x i64> before the reduction. The expected output widens
; q0 with vmovlb.s8 then vmovlb.s16, widens q1 with vmovlb.s16, and then uses
; vmlalv.s32.
790 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y) {
791 ; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext:
792 ; CHECK: @ %bb.0: @ %entry
793 ; CHECK-NEXT: vmovlb.s8 q0, q0
794 ; CHECK-NEXT: vmovlb.s16 q1, q1
795 ; CHECK-NEXT: vmovlb.s16 q0, q0
796 ; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
799 %xx = sext <4 x i8> %x to <4 x i32>
800 %yy = sext <4 x i16> %y to <4 x i32>
801 %mm = mul <4 x i32> %xx, %yy
802 %m = sext <4 x i32> %mm to <4 x i64>
803 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Zero-extends two <2 x i8> operands to <2 x i64>, multiplies and reduces.
; The expected output masks both inputs with a 0xff per-64-bit-lane constant,
; moves the lanes to core registers, multiplies each pair with umull, and
; combines the low words with add and the high words with orrs.
807 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
808 ; CHECK-LABEL: add_v2i8_v2i64_zext:
809 ; CHECK: @ %bb.0: @ %entry
810 ; CHECK-NEXT: vmov.i64 q2, #0xff
811 ; CHECK-NEXT: vand q1, q1, q2
812 ; CHECK-NEXT: vand q0, q0, q2
813 ; CHECK-NEXT: vmov r0, s6
814 ; CHECK-NEXT: vmov r1, s2
815 ; CHECK-NEXT: vmov r2, s4
816 ; CHECK-NEXT: vmov r3, s0
817 ; CHECK-NEXT: umull r0, r1, r1, r0
818 ; CHECK-NEXT: umull r2, r3, r3, r2
819 ; CHECK-NEXT: add r0, r2
820 ; CHECK-NEXT: orrs r1, r3
823 %xx = zext <2 x i8> %x to <2 x i64>
824 %yy = zext <2 x i8> %y to <2 x i64>
825 %m = mul <2 x i64> %xx, %yy
826 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Sign-extends two <2 x i8> operands to <2 x i64>, multiplies and reduces.
; The expected output moves the lanes (s6/s2 and s4/s0) to core registers,
; sign-extends each with sxtb and combines them with smull followed by smlal.
830 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
831 ; CHECK-LABEL: add_v2i8_v2i64_sext:
832 ; CHECK: @ %bb.0: @ %entry
833 ; CHECK-NEXT: vmov r0, s6
834 ; CHECK-NEXT: vmov r1, s2
835 ; CHECK-NEXT: vmov r2, s4
836 ; CHECK-NEXT: vmov r3, s0
837 ; CHECK-NEXT: sxtb r0, r0
838 ; CHECK-NEXT: sxtb r1, r1
839 ; CHECK-NEXT: smull r0, r1, r1, r0
840 ; CHECK-NEXT: sxtb r2, r2
841 ; CHECK-NEXT: sxtb r3, r3
842 ; CHECK-NEXT: smlal r0, r1, r3, r2
845 %xx = sext <2 x i8> %x to <2 x i64>
846 %yy = sext <2 x i8> %y to <2 x i64>
847 %m = mul <2 x i64> %xx, %yy
848 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Multiplies two <2 x i64> vectors (no extension) and reduces with
; llvm.vector.reduce.add.v2i64. The expected output contains no vector
; multiply: each 64-bit lane is moved to core registers (d3/d1, then d2/d0),
; multiplied with a umull + two mla sequence, and the two products are summed
; with adds.w/adc.w. r4/r5/r7/lr are saved and restored around the body.
852 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
853 ; CHECK-LABEL: add_v2i64_v2i64:
854 ; CHECK: @ %bb.0: @ %entry
855 ; CHECK-NEXT: .save {r4, r5, r7, lr}
856 ; CHECK-NEXT: push {r4, r5, r7, lr}
857 ; CHECK-NEXT: vmov r0, lr, d3
858 ; CHECK-NEXT: vmov r2, r3, d1
859 ; CHECK-NEXT: umull r12, r1, r2, r0
860 ; CHECK-NEXT: mla r1, r2, lr, r1
861 ; CHECK-NEXT: mla lr, r3, r0, r1
862 ; CHECK-NEXT: vmov r0, r2, d2
863 ; CHECK-NEXT: vmov r3, r1, d0
864 ; CHECK-NEXT: umull r4, r5, r3, r0
865 ; CHECK-NEXT: mla r2, r3, r2, r5
866 ; CHECK-NEXT: mla r1, r1, r0, r2
867 ; CHECK-NEXT: adds.w r0, r4, r12
868 ; CHECK-NEXT: adc.w r1, r1, lr
869 ; CHECK-NEXT: pop {r4, r5, r7, pc}
871 %m = mul <2 x i64> %x, %y
872 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Accumulating variant: multiplies two <4 x i32> vectors and reduces, with an
; extra i32 argument %a. The expected output shows the accumulating form
; vmlava.u32 into r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
876 define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, i32 %a) {
877 ; CHECK-LABEL: add_v4i32_v4i32_acc:
878 ; CHECK: @ %bb.0: @ %entry
879 ; CHECK-NEXT: vmlava.u32 r0, q0, q1
882 %m = mul <4 x i32> %x, %y
883 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; Accumulating variant: zero-extends two <4 x i32> operands to <4 x i64>,
; multiplies and reduces, with an extra i64 argument %a. The expected output
; shows the accumulating form vmlalva.u32 into r0/r1.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
888 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
889 ; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
890 ; CHECK: @ %bb.0: @ %entry
891 ; CHECK-NEXT: vmlalva.u32 r0, r1, q0, q1
894 %xx = zext <4 x i32> %x to <4 x i64>
895 %yy = zext <4 x i32> %y to <4 x i64>
896 %m = mul <4 x i64> %xx, %yy
897 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Accumulating variant: sign-extends two <4 x i32> operands to <4 x i64>,
; multiplies and reduces, with an extra i64 argument %a. The expected output
; shows the accumulating form vmlalva.s32 into r0/r1.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
902 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
903 ; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
904 ; CHECK: @ %bb.0: @ %entry
905 ; CHECK-NEXT: vmlalva.s32 r0, r1, q0, q1
908 %xx = sext <4 x i32> %x to <4 x i64>
909 %yy = sext <4 x i32> %y to <4 x i64>
910 %m = mul <4 x i64> %xx, %yy
911 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
; Accumulating variant: zero-extends two <2 x i32> operands to <2 x i64>,
; multiplies and reduces, with an extra i64 argument %a. The expected output
; shows vmullb.u32 into q2, a 64-bit sum of its two halves (adds.w/adc.w),
; and a second 64-bit add into r0/r1 (adds/adcs); r7/lr are saved around it.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
916 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
917 ; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
918 ; CHECK: @ %bb.0: @ %entry
919 ; CHECK-NEXT: .save {r7, lr}
920 ; CHECK-NEXT: push {r7, lr}
921 ; CHECK-NEXT: vmullb.u32 q2, q0, q1
922 ; CHECK-NEXT: vmov lr, r12, d5
923 ; CHECK-NEXT: vmov r3, r2, d4
924 ; CHECK-NEXT: adds.w r3, r3, lr
925 ; CHECK-NEXT: adc.w r2, r2, r12
926 ; CHECK-NEXT: adds r0, r0, r3
927 ; CHECK-NEXT: adcs r1, r2
928 ; CHECK-NEXT: pop {r7, pc}
930 %xx = zext <2 x i32> %x to <2 x i64>
931 %yy = zext <2 x i32> %y to <2 x i64>
932 %m = mul <2 x i64> %xx, %yy
933 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Accumulating variant: sign-extends two <2 x i32> operands to <2 x i64>,
; multiplies and reduces, with an extra i64 argument %a. The expected output
; shows vmullb.s32 into q2, a 64-bit sum of its two halves (adds.w/adc.w),
; and a second 64-bit add into r0/r1 (adds/adcs); r7/lr are saved around it.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
938 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
939 ; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
940 ; CHECK: @ %bb.0: @ %entry
941 ; CHECK-NEXT: .save {r7, lr}
942 ; CHECK-NEXT: push {r7, lr}
943 ; CHECK-NEXT: vmullb.s32 q2, q0, q1
944 ; CHECK-NEXT: vmov lr, r12, d5
945 ; CHECK-NEXT: vmov r3, r2, d4
946 ; CHECK-NEXT: adds.w r3, r3, lr
947 ; CHECK-NEXT: adc.w r2, r2, r12
948 ; CHECK-NEXT: adds r0, r0, r3
949 ; CHECK-NEXT: adcs r1, r2
950 ; CHECK-NEXT: pop {r7, pc}
952 %xx = sext <2 x i32> %x to <2 x i64>
953 %yy = sext <2 x i32> %y to <2 x i64>
954 %m = mul <2 x i64> %xx, %yy
955 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Accumulating variant: zero-extends two <8 x i16> operands to <8 x i32>,
; multiplies and reduces, with an extra i32 argument %a. The expected output
; shows the accumulating form vmlava.u16 into r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
960 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
961 ; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
962 ; CHECK: @ %bb.0: @ %entry
963 ; CHECK-NEXT: vmlava.u16 r0, q0, q1
966 %xx = zext <8 x i16> %x to <8 x i32>
967 %yy = zext <8 x i16> %y to <8 x i32>
968 %m = mul <8 x i32> %xx, %yy
969 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
; Accumulating variant: sign-extends two <8 x i16> operands to <8 x i32>,
; multiplies and reduces, with an extra i32 argument %a. The expected output
; shows the accumulating form vmlava.s16 into r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
974 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
975 ; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
976 ; CHECK: @ %bb.0: @ %entry
977 ; CHECK-NEXT: vmlava.s16 r0, q0, q1
980 %xx = sext <8 x i16> %x to <8 x i32>
981 %yy = sext <8 x i16> %y to <8 x i32>
982 %m = mul <8 x i32> %xx, %yy
983 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
; Accumulating variant: zero-extends two <4 x i16> operands to <4 x i32>,
; multiplies and reduces, with an extra i32 argument %a. The expected output
; widens both inputs with vmovlb.u16 and then uses vmlava.u32 into r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
988 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
989 ; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
990 ; CHECK: @ %bb.0: @ %entry
991 ; CHECK-NEXT: vmovlb.u16 q1, q1
992 ; CHECK-NEXT: vmovlb.u16 q0, q0
993 ; CHECK-NEXT: vmlava.u32 r0, q0, q1
996 %xx = zext <4 x i16> %x to <4 x i32>
997 %yy = zext <4 x i16> %y to <4 x i32>
998 %m = mul <4 x i32> %xx, %yy
999 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; Accumulating variant: sign-extends two <4 x i16> operands to <4 x i32>,
; multiplies and reduces, with an extra i32 argument %a. The expected output
; widens both inputs with vmovlb.s16 and then uses vmlava.u32 into r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
1004 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
1005 ; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
1006 ; CHECK: @ %bb.0: @ %entry
1007 ; CHECK-NEXT: vmovlb.s16 q1, q1
1008 ; CHECK-NEXT: vmovlb.s16 q0, q0
1009 ; CHECK-NEXT: vmlava.u32 r0, q0, q1
1012 %xx = sext <4 x i16> %x to <4 x i32>
1013 %yy = sext <4 x i16> %y to <4 x i32>
1014 %m = mul <4 x i32> %xx, %yy
1015 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; Accumulating variant: multiplies two <8 x i16> vectors and reduces to i16,
; with an extra i16 argument %a and a zeroext return. The expected output
; shows vmlava.u16 into r0 followed by uxth.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
1020 define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, i16 %a) {
1021 ; CHECK-LABEL: add_v8i16_v8i16_acc:
1022 ; CHECK: @ %bb.0: @ %entry
1023 ; CHECK-NEXT: vmlava.u16 r0, q0, q1
1024 ; CHECK-NEXT: uxth r0, r0
1027 %m = mul <8 x i16> %x, %y
1028 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
; Accumulating variant: zero-extends two <8 x i16> operands to <8 x i64>,
; multiplies and reduces, with an extra i64 argument %a. The expected output
; shows the accumulating form vmlalva.u16 into r0/r1.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
1033 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
1034 ; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
1035 ; CHECK: @ %bb.0: @ %entry
1036 ; CHECK-NEXT: vmlalva.u16 r0, r1, q0, q1
1039 %xx = zext <8 x i16> %x to <8 x i64>
1040 %yy = zext <8 x i16> %y to <8 x i64>
1041 %m = mul <8 x i64> %xx, %yy
1042 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
; Accumulating variant: sign-extends two <8 x i16> operands to <8 x i64>,
; multiplies and reduces, with an extra i64 argument %a. The expected output
; shows the accumulating form vmlalva.s16 into r0/r1.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
1047 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
1048 ; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
1049 ; CHECK: @ %bb.0: @ %entry
1050 ; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q1
1053 %xx = sext <8 x i16> %x to <8 x i64>
1054 %yy = sext <8 x i16> %y to <8 x i64>
1055 %m = mul <8 x i64> %xx, %yy
1056 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
; Accumulating, two-step extension: <8 x i16> operands are zero-extended to
; <8 x i32> and multiplied, the product is zero-extended to <8 x i64> and
; reduced, with an extra i64 argument %a. The expected output shows
; vmlalva.u16 into r0/r1.
; NOTE(review): %a is not referenced in the lines shown here; presumably it
; is added to the reduction result after them -- confirm.
1061 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
1062 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext:
1063 ; CHECK: @ %bb.0: @ %entry
1064 ; CHECK-NEXT: vmlalva.u16 r0, r1, q0, q1
1067 %xx = zext <8 x i16> %x to <8 x i32>
1068 %yy = zext <8 x i16> %y to <8 x i32>
1069 %m = mul <8 x i32> %xx, %yy
1070 %ma = zext <8 x i32> %m to <8 x i64>
1071 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
; Two-step widening MLA reduction: <8 x i16> operands are sign-extended to
; <8 x i32> and multiplied, then the product is sign-extended to <8 x i64>
; before the add-reduction.
; Expected code: the whole chain is still matched as one vmlalva.s16 on r0/r1.
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1076 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
1077 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext:
1078 ; CHECK: @ %bb.0: @ %entry
1079 ; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q1
1082 %xx = sext <8 x i16> %x to <8 x i32>
1083 %yy = sext <8 x i16> %y to <8 x i32>
1084 %m = mul <8 x i32> %xx, %yy
1085 %ma = sext <8 x i32> %m to <8 x i64>
1086 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
; Mixed-extension MLA reduction: %x is sign-extended to <8 x i32> and
; multiplied by itself, and the square is then zero-extended to <8 x i64>
; before the add-reduction. %y is not used in the lines shown here.
; Expected code: one signed vmlalva.s16 with q0 as both vector operands.
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1091 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
1092 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext:
1093 ; CHECK: @ %bb.0: @ %entry
1094 ; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q0
1097 %xx = sext <8 x i16> %x to <8 x i32>
1098 %m = mul <8 x i32> %xx, %xx
1099 %ma = zext <8 x i32> %m to <8 x i64>
1100 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
; MLA reduction of two <2 x i16> vectors zero-extended to <2 x i64>.
; Expected code uses no vector MLA instruction: both operands are masked with
; 0xffff per 64-bit lane (vmov.i64 + vand), the lanes are moved to core
; registers, multiplied with umull/umlal, and the 64-bit sum is folded into
; r0/r1 with adds/adc.w. lr is used as a scratch register, hence the push/pop.
; NOTE(review): %a is not referenced in the IR lines shown here; presumably it
; is added to %z later in the function - confirm.
1105 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
1106 ; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
1107 ; CHECK: @ %bb.0: @ %entry
1108 ; CHECK-NEXT: .save {r7, lr}
1109 ; CHECK-NEXT: push {r7, lr}
1110 ; CHECK-NEXT: vmov.i64 q2, #0xffff
1111 ; CHECK-NEXT: vand q1, q1, q2
1112 ; CHECK-NEXT: vand q0, q0, q2
1113 ; CHECK-NEXT: vmov r2, s6
1114 ; CHECK-NEXT: vmov r3, s2
1115 ; CHECK-NEXT: vmov r12, s4
1116 ; CHECK-NEXT: umull r2, lr, r3, r2
1117 ; CHECK-NEXT: vmov r3, s0
1118 ; CHECK-NEXT: umlal r2, lr, r3, r12
1119 ; CHECK-NEXT: adds r0, r0, r2
1120 ; CHECK-NEXT: adc.w r1, r1, lr
1121 ; CHECK-NEXT: pop {r7, pc}
1123 %xx = zext <2 x i16> %x to <2 x i64>
1124 %yy = zext <2 x i16> %y to <2 x i64>
1125 %m = mul <2 x i64> %xx, %yy
1126 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; MLA reduction of two <2 x i16> vectors sign-extended to <2 x i64>.
; Expected code uses no vector MLA instruction: each lane is moved to a core
; register, sign-extended with sxth, multiplied with smull/smlal, and the
; 64-bit sum is folded into r0/r1 with adds/adc.w. lr is used as a scratch
; register, hence the push/pop.
; NOTE(review): %a is not referenced in the IR lines shown here; presumably it
; is added to %z later in the function - confirm.
1131 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
1132 ; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
1133 ; CHECK: @ %bb.0: @ %entry
1134 ; CHECK-NEXT: .save {r7, lr}
1135 ; CHECK-NEXT: push {r7, lr}
1136 ; CHECK-NEXT: vmov r2, s6
1137 ; CHECK-NEXT: vmov r3, s2
1138 ; CHECK-NEXT: sxth r2, r2
1139 ; CHECK-NEXT: sxth r3, r3
1140 ; CHECK-NEXT: smull r2, r12, r3, r2
1141 ; CHECK-NEXT: vmov r3, s4
1142 ; CHECK-NEXT: sxth.w lr, r3
1143 ; CHECK-NEXT: vmov r3, s0
1144 ; CHECK-NEXT: sxth r3, r3
1145 ; CHECK-NEXT: smlal r2, r12, r3, lr
1146 ; CHECK-NEXT: adds r0, r0, r2
1147 ; CHECK-NEXT: adc.w r1, r1, r12
1148 ; CHECK-NEXT: pop {r7, pc}
1150 %xx = sext <2 x i16> %x to <2 x i64>
1151 %yy = sext <2 x i16> %y to <2 x i64>
1152 %m = mul <2 x i64> %xx, %yy
1153 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; MLA reduction of two <16 x i8> vectors zero-extended directly to
; <16 x i32>, multiplied lane-wise and add-reduced to an i32.
; Expected code: a single unsigned vmlava.u8 on r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1158 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
1159 ; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
1160 ; CHECK: @ %bb.0: @ %entry
1161 ; CHECK-NEXT: vmlava.u8 r0, q0, q1
1164 %xx = zext <16 x i8> %x to <16 x i32>
1165 %yy = zext <16 x i8> %y to <16 x i32>
1166 %m = mul <16 x i32> %xx, %yy
1167 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
; MLA reduction of two <16 x i8> vectors sign-extended directly to
; <16 x i32>, multiplied lane-wise and add-reduced to an i32.
; Expected code: a single signed vmlava.s8 on r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1172 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
1173 ; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
1174 ; CHECK: @ %bb.0: @ %entry
1175 ; CHECK-NEXT: vmlava.s8 r0, q0, q1
1178 %xx = sext <16 x i8> %x to <16 x i32>
1179 %yy = sext <16 x i8> %y to <16 x i32>
1180 %m = mul <16 x i32> %xx, %yy
1181 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
; Two-step widening MLA reduction: <16 x i8> operands are zero-extended to
; <16 x i16> and multiplied, then the product is zero-extended to <16 x i32>
; before the add-reduction.
; Expected code: the whole chain is still matched as one vmlava.u8 on r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1186 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
1187 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext:
1188 ; CHECK: @ %bb.0: @ %entry
1189 ; CHECK-NEXT: vmlava.u8 r0, q0, q1
1192 %xx = zext <16 x i8> %x to <16 x i16>
1193 %yy = zext <16 x i8> %y to <16 x i16>
1194 %m = mul <16 x i16> %xx, %yy
1195 %ma = zext <16 x i16> %m to <16 x i32>
1196 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
; Two-step widening MLA reduction: <16 x i8> operands are sign-extended to
; <16 x i16> and multiplied, then the product is sign-extended to <16 x i32>
; before the add-reduction.
; Expected code: the whole chain is still matched as one vmlava.s8 on r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1201 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
1202 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext:
1203 ; CHECK: @ %bb.0: @ %entry
1204 ; CHECK-NEXT: vmlava.s8 r0, q0, q1
1207 %xx = sext <16 x i8> %x to <16 x i16>
1208 %yy = sext <16 x i8> %y to <16 x i16>
1209 %m = mul <16 x i16> %xx, %yy
1210 %ma = sext <16 x i16> %m to <16 x i32>
1211 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
; Mixed-extension MLA reduction with a single vector argument: %x is
; sign-extended to <16 x i16> and multiplied by itself, and the square is
; then zero-extended to <16 x i32> before the add-reduction.
; Expected code: one signed vmlava.s8 with q0 as both vector operands.
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1216 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, i32 %a) {
1217 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext:
1218 ; CHECK: @ %bb.0: @ %entry
1219 ; CHECK-NEXT: vmlava.s8 r0, q0, q0
1222 %xx = sext <16 x i8> %x to <16 x i16>
1223 %m = mul <16 x i16> %xx, %xx
1224 %ma = zext <16 x i16> %m to <16 x i32>
1225 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
; MLA reduction of two <4 x i8> vectors zero-extended to <4 x i32>,
; multiplied lane-wise and add-reduced to an i32.
; Expected code: the zero-extension is done by masking each 32-bit lane with
; 0xff (vmov.i32 + vand), followed by one vmlava.u32 on r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1230 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
1231 ; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
1232 ; CHECK: @ %bb.0: @ %entry
1233 ; CHECK-NEXT: vmov.i32 q2, #0xff
1234 ; CHECK-NEXT: vand q1, q1, q2
1235 ; CHECK-NEXT: vand q0, q0, q2
1236 ; CHECK-NEXT: vmlava.u32 r0, q0, q1
1239 %xx = zext <4 x i8> %x to <4 x i32>
1240 %yy = zext <4 x i8> %y to <4 x i32>
1241 %m = mul <4 x i32> %xx, %yy
1242 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; MLA reduction of two <4 x i8> vectors sign-extended to <4 x i32>,
; multiplied lane-wise and add-reduced to an i32.
; Expected code: each operand is sign-extended in two steps (vmovlb.s8 then
; vmovlb.s16), followed by one vmlava.u32 on r0.
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1247 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
1248 ; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
1249 ; CHECK: @ %bb.0: @ %entry
1250 ; CHECK-NEXT: vmovlb.s8 q1, q1
1251 ; CHECK-NEXT: vmovlb.s8 q0, q0
1252 ; CHECK-NEXT: vmovlb.s16 q1, q1
1253 ; CHECK-NEXT: vmovlb.s16 q0, q0
1254 ; CHECK-NEXT: vmlava.u32 r0, q0, q1
1257 %xx = sext <4 x i8> %x to <4 x i32>
1258 %yy = sext <4 x i8> %y to <4 x i32>
1259 %m = mul <4 x i32> %xx, %yy
1260 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
; MLA reduction of two <16 x i8> vectors zero-extended to <16 x i16>,
; multiplied lane-wise and add-reduced to an i16.
; Expected code: one vmlava.u8 on r0, then uxth r0 (the return value is
; declared zeroext i16).
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1265 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
1266 ; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
1267 ; CHECK: @ %bb.0: @ %entry
1268 ; CHECK-NEXT: vmlava.u8 r0, q0, q1
1269 ; CHECK-NEXT: uxth r0, r0
1272 %xx = zext <16 x i8> %x to <16 x i16>
1273 %yy = zext <16 x i8> %y to <16 x i16>
1274 %m = mul <16 x i16> %xx, %yy
1275 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
; MLA reduction of two <16 x i8> vectors sign-extended to <16 x i16>,
; multiplied lane-wise and add-reduced to an i16.
; Expected code: one vmlava.s8 on r0, then sxth r0 (the return value is
; declared signext i16).
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1280 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
1281 ; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
1282 ; CHECK: @ %bb.0: @ %entry
1283 ; CHECK-NEXT: vmlava.s8 r0, q0, q1
1284 ; CHECK-NEXT: sxth r0, r0
1287 %xx = sext <16 x i8> %x to <16 x i16>
1288 %yy = sext <16 x i8> %y to <16 x i16>
1289 %m = mul <16 x i16> %xx, %yy
1290 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
; MLA reduction of two <8 x i8> vectors zero-extended to <8 x i16>,
; multiplied lane-wise and add-reduced to an i16.
; Expected code: each operand is widened with vmovlb.u8, then one vmlava.u16
; on r0, then uxth r0 (the return value is declared zeroext i16).
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1295 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
1296 ; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
1297 ; CHECK: @ %bb.0: @ %entry
1298 ; CHECK-NEXT: vmovlb.u8 q1, q1
1299 ; CHECK-NEXT: vmovlb.u8 q0, q0
1300 ; CHECK-NEXT: vmlava.u16 r0, q0, q1
1301 ; CHECK-NEXT: uxth r0, r0
1304 %xx = zext <8 x i8> %x to <8 x i16>
1305 %yy = zext <8 x i8> %y to <8 x i16>
1306 %m = mul <8 x i16> %xx, %yy
1307 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
; MLA reduction of two <8 x i8> vectors sign-extended to <8 x i16>,
; multiplied lane-wise and add-reduced to an i16.
; Expected code: each operand is widened with vmovlb.s8, then one vmlava.u16
; on r0, then sxth r0 (the return value is declared signext i16).
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1312 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
1313 ; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
1314 ; CHECK: @ %bb.0: @ %entry
1315 ; CHECK-NEXT: vmovlb.s8 q1, q1
1316 ; CHECK-NEXT: vmovlb.s8 q0, q0
1317 ; CHECK-NEXT: vmlava.u16 r0, q0, q1
1318 ; CHECK-NEXT: sxth r0, r0
1321 %xx = sext <8 x i8> %x to <8 x i16>
1322 %yy = sext <8 x i8> %y to <8 x i16>
1323 %m = mul <8 x i16> %xx, %yy
1324 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
; MLA reduction of two <16 x i8> vectors with no widening: lane-wise multiply
; followed by an add-reduction to i8.
; Expected code: one vmlava.u8 on r0, then uxtb r0 (the return value is
; declared zeroext i8).
; NOTE(review): %a is not referenced in the lines shown here; presumably it is
; added to %z later in the function - confirm.
1329 define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, i8 %a) {
1330 ; CHECK-LABEL: add_v16i8_v16i8_acc:
1331 ; CHECK: @ %bb.0: @ %entry
1332 ; CHECK-NEXT: vmlava.u8 r0, q0, q1
1333 ; CHECK-NEXT: uxtb r0, r0
1336 %m = mul <16 x i8> %x, %y
1337 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m)
; MLA reduction of two <16 x i8> register operands zero-extended to
; <16 x i64>, multiplied lane-wise and add-reduced to an i64.
; Expected code: both vectors are stored to a 32-byte stack area, then read
; back eight bytes at a time with widening vldrb.u16 loads (offsets 0 and 8),
; and each half is accumulated into r0/r1 with vmlalva.u16.
; NOTE(review): %a is not referenced in the IR lines shown here; presumably it
; is added to %z later in the function - confirm.
1342 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
1343 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
1344 ; CHECK: @ %bb.0: @ %entry
1345 ; CHECK-NEXT: .pad #32
1346 ; CHECK-NEXT: sub sp, #32
1347 ; CHECK-NEXT: add r2, sp, #16
1348 ; CHECK-NEXT: mov r3, sp
1349 ; CHECK-NEXT: vstrw.32 q1, [r2]
1350 ; CHECK-NEXT: vstrw.32 q0, [r3]
1351 ; CHECK-NEXT: vldrb.u16 q0, [r2]
1352 ; CHECK-NEXT: vldrb.u16 q1, [r3]
1353 ; CHECK-NEXT: vmlalva.u16 r0, r1, q1, q0
1354 ; CHECK-NEXT: vldrb.u16 q0, [r2, #8]
1355 ; CHECK-NEXT: vldrb.u16 q1, [r3, #8]
1356 ; CHECK-NEXT: vmlalva.u16 r0, r1, q1, q0
1357 ; CHECK-NEXT: add sp, #32
1360 %xx = zext <16 x i8> %x to <16 x i64>
1361 %yy = zext <16 x i8> %y to <16 x i64>
1362 %m = mul <16 x i64> %xx, %yy
1363 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
; MLA reduction of two <16 x i8> register operands sign-extended to
; <16 x i64>, multiplied lane-wise and add-reduced to an i64.
; Expected code: both vectors are stored to a 32-byte stack area, then read
; back eight bytes at a time with widening vldrb.s16 loads (offsets 0 and 8),
; and each half is accumulated into r0/r1 with vmlalva.s16.
; NOTE(review): %a is not referenced in the IR lines shown here; presumably it
; is added to %z later in the function - confirm.
1368 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
1369 ; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
1370 ; CHECK: @ %bb.0: @ %entry
1371 ; CHECK-NEXT: .pad #32
1372 ; CHECK-NEXT: sub sp, #32
1373 ; CHECK-NEXT: add r2, sp, #16
1374 ; CHECK-NEXT: mov r3, sp
1375 ; CHECK-NEXT: vstrw.32 q1, [r2]
1376 ; CHECK-NEXT: vstrw.32 q0, [r3]
1377 ; CHECK-NEXT: vldrb.s16 q0, [r2]
1378 ; CHECK-NEXT: vldrb.s16 q1, [r3]
1379 ; CHECK-NEXT: vmlalva.s16 r0, r1, q1, q0
1380 ; CHECK-NEXT: vldrb.s16 q0, [r2, #8]
1381 ; CHECK-NEXT: vldrb.s16 q1, [r3, #8]
1382 ; CHECK-NEXT: vmlalva.s16 r0, r1, q1, q0
1383 ; CHECK-NEXT: add sp, #32
1386 %xx = sext <16 x i8> %x to <16 x i64>
1387 %yy = sext <16 x i8> %y to <16 x i64>
1388 %m = mul <16 x i64> %xx, %yy
1389 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
; Same zero-extending <16 x i8> -> <16 x i64> MLA reduction as the register
; variant, but the operands are loaded from the pointers %xp and %yp.
; Expected code: no stack traffic; each pointer is read directly with
; widening vldrb.u16 loads at offsets 0 and 8, each half is accumulated into
; r2/r3 with vmlalva.u16, and the result is then moved to r0/r1.
; NOTE(review): %a is not referenced in the IR lines shown here; presumably it
; is added to %z later in the function - confirm.
1394 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext_load(<16 x i8> *%xp, <16 x i8> *%yp, i64 %a) {
1395 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext_load:
1396 ; CHECK: @ %bb.0: @ %entry
1397 ; CHECK-NEXT: vldrb.u16 q0, [r1]
1398 ; CHECK-NEXT: vldrb.u16 q1, [r0]
1399 ; CHECK-NEXT: vmlalva.u16 r2, r3, q1, q0
1400 ; CHECK-NEXT: vldrb.u16 q0, [r1, #8]
1401 ; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
1402 ; CHECK-NEXT: vmlalva.u16 r2, r3, q1, q0
1403 ; CHECK-NEXT: mov r0, r2
1404 ; CHECK-NEXT: mov r1, r3
1407 %x = load <16 x i8>, <16 x i8>* %xp
1408 %y = load <16 x i8>, <16 x i8>* %yp
1409 %xx = zext <16 x i8> %x to <16 x i64>
1410 %yy = zext <16 x i8> %y to <16 x i64>
1411 %m = mul <16 x i64> %xx, %yy
1412 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
; Same sign-extending <16 x i8> -> <16 x i64> MLA reduction as the register
; variant, but the operands are loaded from the pointers %xp and %yp.
; Expected code: no stack traffic; each pointer is read directly with
; widening vldrb.s16 loads at offsets 0 and 8, each half is accumulated into
; r2/r3 with vmlalva.s16, and the result is then moved to r0/r1.
; NOTE(review): %a is not referenced in the IR lines shown here; presumably it
; is added to %z later in the function - confirm.
1417 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext_load(<16 x i8> *%xp, <16 x i8> *%yp, i64 %a) {
1418 ; CHECK-LABEL: add_v16i8_v16i64_acc_sext_load:
1419 ; CHECK: @ %bb.0: @ %entry
1420 ; CHECK-NEXT: vldrb.s16 q0, [r1]
1421 ; CHECK-NEXT: vldrb.s16 q1, [r0]
1422 ; CHECK-NEXT: vmlalva.s16 r2, r3, q1, q0
1423 ; CHECK-NEXT: vldrb.s16 q0, [r1, #8]
1424 ; CHECK-NEXT: vldrb.s16 q1, [r0, #8]
1425 ; CHECK-NEXT: vmlalva.s16 r2, r3, q1, q0
1426 ; CHECK-NEXT: mov r0, r2
1427 ; CHECK-NEXT: mov r1, r3
1430 %x = load <16 x i8>, <16 x i8>* %xp
1431 %y = load <16 x i8>, <16 x i8>* %yp
1432 %xx = sext <16 x i8> %x to <16 x i64>
1433 %yy = sext <16 x i8> %y to <16 x i64>
1434 %m = mul <16 x i64> %xx, %yy
1435 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
; MLA reduction of two <2 x i8> vectors zero-extended to <2 x i64>.
; Expected code uses no vector MLA instruction: both operands are masked with
; 0xff per 64-bit lane (vmov.i64 + vand), the lanes are moved to core
; registers and multiplied with two umull instructions. The low words are
; summed with add and the high words combined with orr.w, and the result is
; folded into r0/r1 with adds/adcs.
; NOTE(review): the orr.w on the high words is presumably valid because the
; 8-bit masked inputs leave both high words zero - confirm.
; NOTE(review): %a is not referenced in the IR lines shown here; presumably it
; is added to %z later in the function - confirm.
1440 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
1441 ; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
1442 ; CHECK: @ %bb.0: @ %entry
1443 ; CHECK-NEXT: .save {r7, lr}
1444 ; CHECK-NEXT: push {r7, lr}
1445 ; CHECK-NEXT: vmov.i64 q2, #0xff
1446 ; CHECK-NEXT: vand q1, q1, q2
1447 ; CHECK-NEXT: vand q0, q0, q2
1448 ; CHECK-NEXT: vmov r2, s6
1449 ; CHECK-NEXT: vmov r3, s2
1450 ; CHECK-NEXT: umull r12, lr, r3, r2
1451 ; CHECK-NEXT: vmov r2, s4
1452 ; CHECK-NEXT: vmov r3, s0
1453 ; CHECK-NEXT: umull r2, r3, r3, r2
1454 ; CHECK-NEXT: add r2, r12
1455 ; CHECK-NEXT: orr.w r3, r3, lr
1456 ; CHECK-NEXT: adds r0, r0, r2
1457 ; CHECK-NEXT: adcs r1, r3
1458 ; CHECK-NEXT: pop {r7, pc}
1460 %xx = zext <2 x i8> %x to <2 x i64>
1461 %yy = zext <2 x i8> %y to <2 x i64>
1462 %m = mul <2 x i64> %xx, %yy
1463 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; MLA reduction of two <2 x i8> vectors sign-extended to <2 x i64>.
; Expected code uses no vector MLA instruction: each lane is moved to a core
; register, sign-extended with sxtb, multiplied with smull/smlal, and the
; 64-bit sum is folded into r0/r1 with adds/adc.w. lr is used as a scratch
; register, hence the push/pop.
; NOTE(review): %a is not referenced in the IR lines shown here; presumably it
; is added to %z later in the function - confirm.
1468 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
1469 ; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
1470 ; CHECK: @ %bb.0: @ %entry
1471 ; CHECK-NEXT: .save {r7, lr}
1472 ; CHECK-NEXT: push {r7, lr}
1473 ; CHECK-NEXT: vmov r2, s6
1474 ; CHECK-NEXT: vmov r3, s2
1475 ; CHECK-NEXT: sxtb r2, r2
1476 ; CHECK-NEXT: sxtb r3, r3
1477 ; CHECK-NEXT: smull r2, r12, r3, r2
1478 ; CHECK-NEXT: vmov r3, s4
1479 ; CHECK-NEXT: sxtb.w lr, r3
1480 ; CHECK-NEXT: vmov r3, s0
1481 ; CHECK-NEXT: sxtb r3, r3
1482 ; CHECK-NEXT: smlal r2, r12, r3, lr
1483 ; CHECK-NEXT: adds r0, r0, r2
1484 ; CHECK-NEXT: adc.w r1, r1, r12
1485 ; CHECK-NEXT: pop {r7, pc}
1487 %xx = sext <2 x i8> %x to <2 x i64>
1488 %yy = sext <2 x i8> %y to <2 x i64>
1489 %m = mul <2 x i64> %xx, %yy
1490 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; MLA reduction of two <2 x i64> vectors with no extension: full 64-bit
; lane-wise multiply followed by an add-reduction to i64.
; Expected code is entirely scalar: each 64-bit lane is moved to a pair of
; core registers, each 64x64 product is built from one umull plus two mla
; instructions, the two products are summed with adds.w/adc.w, and the result
; is folded into r0/r1 with adds/adcs. r4-r8 and lr are saved and restored
; around the sequence.
; NOTE(review): %a is not referenced in the IR lines shown here; presumably it
; is added to %z later in the function - confirm.
1495 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) {
1496 ; CHECK-LABEL: add_v2i64_v2i64_acc:
1497 ; CHECK: @ %bb.0: @ %entry
1498 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
1499 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
1500 ; CHECK-NEXT: vmov r2, r12, d3
1501 ; CHECK-NEXT: vmov r3, lr, d1
1502 ; CHECK-NEXT: vmov r4, r6, d0
1503 ; CHECK-NEXT: umull r8, r5, r3, r2
1504 ; CHECK-NEXT: mla r3, r3, r12, r5
1505 ; CHECK-NEXT: mla r12, lr, r2, r3
1506 ; CHECK-NEXT: vmov r3, r5, d2
1507 ; CHECK-NEXT: umull r7, r2, r4, r3
1508 ; CHECK-NEXT: mla r2, r4, r5, r2
1509 ; CHECK-NEXT: mla r2, r6, r3, r2
1510 ; CHECK-NEXT: adds.w r3, r7, r8
1511 ; CHECK-NEXT: adc.w r2, r2, r12
1512 ; CHECK-NEXT: adds r0, r0, r3
1513 ; CHECK-NEXT: adcs r1, r2
1514 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
1516 %m = mul <2 x i64> %x, %y
1517 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
; Declarations of the llvm.vector.reduce.add intrinsic overloads, one per
; vector type. Each takes a single vector and returns a scalar of the vector's
; element type (i16, i32, i64 or i8).
1522 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
1523 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
1524 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
1525 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
1526 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
1527 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
1528 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
1529 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
1530 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
1531 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)