1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
4 define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
5 ; CHECK-LABEL: add_v4i32_v4i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: vpt.i32 eq, q2, zr
8 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
9 ; CHECK-NEXT: bx lr
10 entry:
11 %c = icmp eq <4 x i32> %b, zeroinitializer
12 %m = mul <4 x i32> %x, %y
13 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
14 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
15 ret i32 %z
16 }
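; Each test below follows the same pattern: multiply %x and %y (with the
; extensions named in the function), keep only the product lanes where %b is
; zero via a select, and sum the result with llvm.vector.reduce.add. Where the
; types line up, this should fold to a single predicated reduction (vpt plus
; vmlavt/vmlalvt).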
18 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
19 ; CHECK-LABEL: add_v4i32_v4i64_zext:
20 ; CHECK: @ %bb.0: @ %entry
21 ; CHECK-NEXT: vpt.i32 eq, q2, zr
22 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
23 ; CHECK-NEXT: bx lr
24 entry:
25 %c = icmp eq <4 x i32> %b, zeroinitializer
26 %xx = zext <4 x i32> %x to <4 x i64>
27 %yy = zext <4 x i32> %y to <4 x i64>
28 %m = mul <4 x i64> %xx, %yy
29 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
30 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
31 ret i64 %z
32 }
34 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
35 ; CHECK-LABEL: add_v4i32_v4i64_sext:
36 ; CHECK: @ %bb.0: @ %entry
37 ; CHECK-NEXT: vpt.i32 eq, q2, zr
38 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
39 ; CHECK-NEXT: bx lr
40 entry:
41 %c = icmp eq <4 x i32> %b, zeroinitializer
42 %xx = sext <4 x i32> %x to <4 x i64>
43 %yy = sext <4 x i32> %y to <4 x i64>
44 %m = mul <4 x i64> %xx, %yy
45 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
46 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
47 ret i64 %z
48 }
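; The v2i32 -> v2i64 cases are not matched to a predicated reduction: the
; product is formed with vmullb, the predicate is materialised in a GPR with
; csetm/bfi and moved to p0, and the final i64 sum is done with adds/adcs.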
50 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
51 ; CHECK-LABEL: add_v2i32_v2i64_zext:
52 ; CHECK: @ %bb.0: @ %entry
53 ; CHECK-NEXT: vmov r0, s8
54 ; CHECK-NEXT: movs r1, #0
55 ; CHECK-NEXT: vmullb.u32 q3, q0, q1
56 ; CHECK-NEXT: vmov.i32 q0, #0x0
57 ; CHECK-NEXT: cmp r0, #0
58 ; CHECK-NEXT: csetm r0, eq
59 ; CHECK-NEXT: bfi r1, r0, #0, #8
60 ; CHECK-NEXT: vmov r0, s10
61 ; CHECK-NEXT: cmp r0, #0
62 ; CHECK-NEXT: csetm r0, eq
63 ; CHECK-NEXT: bfi r1, r0, #8, #8
64 ; CHECK-NEXT: vmsr p0, r1
65 ; CHECK-NEXT: vpsel q0, q3, q0
66 ; CHECK-NEXT: vmov r0, r1, d1
67 ; CHECK-NEXT: vmov r2, r3, d0
68 ; CHECK-NEXT: adds r0, r0, r2
69 ; CHECK-NEXT: adcs r1, r3
70 ; CHECK-NEXT: bx lr
71 entry:
72 %c = icmp eq <2 x i32> %b, zeroinitializer
73 %xx = zext <2 x i32> %x to <2 x i64>
74 %yy = zext <2 x i32> %y to <2 x i64>
75 %m = mul <2 x i64> %xx, %yy
76 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
77 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
78 ret i64 %z
79 }
81 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
82 ; CHECK-LABEL: add_v2i32_v2i64_sext:
83 ; CHECK: @ %bb.0: @ %entry
84 ; CHECK-NEXT: vmov r0, s8
85 ; CHECK-NEXT: movs r1, #0
86 ; CHECK-NEXT: vmullb.s32 q3, q0, q1
87 ; CHECK-NEXT: vmov.i32 q0, #0x0
88 ; CHECK-NEXT: cmp r0, #0
89 ; CHECK-NEXT: csetm r0, eq
90 ; CHECK-NEXT: bfi r1, r0, #0, #8
91 ; CHECK-NEXT: vmov r0, s10
92 ; CHECK-NEXT: cmp r0, #0
93 ; CHECK-NEXT: csetm r0, eq
94 ; CHECK-NEXT: bfi r1, r0, #8, #8
95 ; CHECK-NEXT: vmsr p0, r1
96 ; CHECK-NEXT: vpsel q0, q3, q0
97 ; CHECK-NEXT: vmov r0, r1, d1
98 ; CHECK-NEXT: vmov r2, r3, d0
99 ; CHECK-NEXT: adds r0, r0, r2
100 ; CHECK-NEXT: adcs r1, r3
101 ; CHECK-NEXT: bx lr
102 entry:
103 %c = icmp eq <2 x i32> %b, zeroinitializer
104 %xx = sext <2 x i32> %x to <2 x i64>
105 %yy = sext <2 x i32> %y to <2 x i64>
106 %m = mul <2 x i64> %xx, %yy
107 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
108 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
109 ret i64 %z
110 }
112 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
113 ; CHECK-LABEL: add_v8i16_v8i32_zext:
114 ; CHECK: @ %bb.0: @ %entry
115 ; CHECK-NEXT: vpt.i16 eq, q2, zr
116 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
117 ; CHECK-NEXT: bx lr
118 entry:
119 %c = icmp eq <8 x i16> %b, zeroinitializer
120 %xx = zext <8 x i16> %x to <8 x i32>
121 %yy = zext <8 x i16> %y to <8 x i32>
122 %m = mul <8 x i32> %xx, %yy
123 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
124 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
125 ret i32 %z
126 }
128 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
129 ; CHECK-LABEL: add_v8i16_v8i32_sext:
130 ; CHECK: @ %bb.0: @ %entry
131 ; CHECK-NEXT: vpt.i16 eq, q2, zr
132 ; CHECK-NEXT: vmlavt.s16 r0, q0, q1
133 ; CHECK-NEXT: bx lr
134 entry:
135 %c = icmp eq <8 x i16> %b, zeroinitializer
136 %xx = sext <8 x i16> %x to <8 x i32>
137 %yy = sext <8 x i16> %y to <8 x i32>
138 %m = mul <8 x i32> %xx, %yy
139 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
140 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
141 ret i32 %z
142 }
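; Inputs narrower than 128 bits are widened in-register first (vmovlb, or a
; vand with a lane mask for the i8-in-i32 cases further down) before the
; predicated reduction is formed.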
144 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
145 ; CHECK-LABEL: add_v4i16_v4i32_zext:
146 ; CHECK: @ %bb.0: @ %entry
147 ; CHECK-NEXT: vmovlb.u16 q2, q2
148 ; CHECK-NEXT: vmovlb.u16 q1, q1
149 ; CHECK-NEXT: vmovlb.u16 q0, q0
150 ; CHECK-NEXT: vpt.i32 eq, q2, zr
151 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
152 ; CHECK-NEXT: bx lr
153 entry:
154 %c = icmp eq <4 x i16> %b, zeroinitializer
155 %xx = zext <4 x i16> %x to <4 x i32>
156 %yy = zext <4 x i16> %y to <4 x i32>
157 %m = mul <4 x i32> %xx, %yy
158 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
159 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
160 ret i32 %z
161 }
163 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
164 ; CHECK-LABEL: add_v4i16_v4i32_sext:
165 ; CHECK: @ %bb.0: @ %entry
166 ; CHECK-NEXT: vmovlb.u16 q2, q2
167 ; CHECK-NEXT: vmovlb.s16 q1, q1
168 ; CHECK-NEXT: vmovlb.s16 q0, q0
169 ; CHECK-NEXT: vpt.i32 eq, q2, zr
170 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
171 ; CHECK-NEXT: bx lr
172 entry:
173 %c = icmp eq <4 x i16> %b, zeroinitializer
174 %xx = sext <4 x i16> %x to <4 x i32>
175 %yy = sext <4 x i16> %y to <4 x i32>
176 %m = mul <4 x i32> %xx, %yy
177 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
178 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
179 ret i32 %z
180 }
182 define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
183 ; CHECK-LABEL: add_v8i16_v8i16:
184 ; CHECK: @ %bb.0: @ %entry
185 ; CHECK-NEXT: vpt.i16 eq, q2, zr
186 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
187 ; CHECK-NEXT: uxth r0, r0
188 ; CHECK-NEXT: bx lr
189 entry:
190 %c = icmp eq <8 x i16> %b, zeroinitializer
191 %m = mul <8 x i16> %x, %y
192 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
193 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
194 ret i16 %z
195 }
197 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
198 ; CHECK-LABEL: add_v8i16_v8i64_zext:
199 ; CHECK: @ %bb.0: @ %entry
200 ; CHECK-NEXT: vpt.i16 eq, q2, zr
201 ; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
202 ; CHECK-NEXT: bx lr
203 entry:
204 %c = icmp eq <8 x i16> %b, zeroinitializer
205 %xx = zext <8 x i16> %x to <8 x i64>
206 %yy = zext <8 x i16> %y to <8 x i64>
207 %m = mul <8 x i64> %xx, %yy
208 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
209 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
210 ret i64 %z
211 }
213 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
214 ; CHECK-LABEL: add_v8i16_v8i64_sext:
215 ; CHECK: @ %bb.0: @ %entry
216 ; CHECK-NEXT: vpt.i16 eq, q2, zr
217 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
218 ; CHECK-NEXT: bx lr
219 entry:
220 %c = icmp eq <8 x i16> %b, zeroinitializer
221 %xx = sext <8 x i16> %x to <8 x i64>
222 %yy = sext <8 x i16> %y to <8 x i64>
223 %m = mul <8 x i64> %xx, %yy
224 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
225 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
226 ret i64 %z
227 }
229 define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_zext(<8 x i16> %x, <8 x i8> %y, <8 x i16> %b) {
230 ; CHECK-LABEL: add_v8i8i16_v8i64_zext:
231 ; CHECK: @ %bb.0: @ %entry
232 ; CHECK-NEXT: vmovlb.u8 q1, q1
233 ; CHECK-NEXT: vpt.i16 eq, q2, zr
234 ; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
235 ; CHECK-NEXT: bx lr
236 entry:
237 %c = icmp eq <8 x i16> %b, zeroinitializer
238 %xx = zext <8 x i16> %x to <8 x i64>
239 %yy = zext <8 x i8> %y to <8 x i64>
240 %m = mul <8 x i64> %xx, %yy
241 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
242 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
243 ret i64 %z
244 }
246 define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y, <8 x i16> %b) {
247 ; CHECK-LABEL: add_v8i8i16_v8i64_sext:
248 ; CHECK: @ %bb.0: @ %entry
249 ; CHECK-NEXT: vmovlb.s8 q1, q1
250 ; CHECK-NEXT: vpt.i16 eq, q2, zr
251 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
252 ; CHECK-NEXT: bx lr
253 entry:
254 %c = icmp eq <8 x i16> %b, zeroinitializer
255 %xx = sext <8 x i16> %x to <8 x i64>
256 %yy = sext <8 x i8> %y to <8 x i64>
257 %m = mul <8 x i64> %xx, %yy
258 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
259 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
260 ret i64 %z
261 }
263 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
264 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
265 ; CHECK: @ %bb.0: @ %entry
266 ; CHECK-NEXT: vpt.i16 eq, q2, zr
267 ; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
268 ; CHECK-NEXT: bx lr
269 entry:
270 %c = icmp eq <8 x i16> %b, zeroinitializer
271 %xx = zext <8 x i16> %x to <8 x i32>
272 %yy = zext <8 x i16> %y to <8 x i32>
273 %m = mul <8 x i32> %xx, %yy
274 %ma = zext <8 x i32> %m to <8 x i64>
275 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
276 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
277 ret i64 %z
278 }
280 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
281 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
282 ; CHECK: @ %bb.0: @ %entry
283 ; CHECK-NEXT: vpt.i16 eq, q2, zr
284 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
285 ; CHECK-NEXT: bx lr
286 entry:
287 %c = icmp eq <8 x i16> %b, zeroinitializer
288 %xx = sext <8 x i16> %x to <8 x i32>
289 %yy = sext <8 x i16> %y to <8 x i32>
290 %m = mul <8 x i32> %xx, %yy
291 %ma = sext <8 x i32> %m to <8 x i64>
292 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
293 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
294 ret i64 %z
295 }
297 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
298 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
299 ; CHECK: @ %bb.0: @ %entry
300 ; CHECK-NEXT: vpt.i16 eq, q2, zr
301 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q0
302 ; CHECK-NEXT: bx lr
303 entry:
304 %c = icmp eq <8 x i16> %b, zeroinitializer
305 %xx = sext <8 x i16> %x to <8 x i32>
306 %m = mul <8 x i32> %xx, %xx
307 %ma = zext <8 x i32> %m to <8 x i64>
308 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
309 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
310 ret i64 %z
311 }
313 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
314 ; CHECK-LABEL: add_v4i16_v4i64_zext:
315 ; CHECK: @ %bb.0: @ %entry
316 ; CHECK-NEXT: vmovlb.u16 q2, q2
317 ; CHECK-NEXT: vmovlb.u16 q1, q1
318 ; CHECK-NEXT: vmovlb.u16 q0, q0
319 ; CHECK-NEXT: vpt.i32 eq, q2, zr
320 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
321 ; CHECK-NEXT: bx lr
322 entry:
323 %c = icmp eq <4 x i16> %b, zeroinitializer
324 %xx = zext <4 x i16> %x to <4 x i64>
325 %yy = zext <4 x i16> %y to <4 x i64>
326 %m = mul <4 x i64> %xx, %yy
327 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
328 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
329 ret i64 %z
330 }
332 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
333 ; CHECK-LABEL: add_v4i16_v4i64_sext:
334 ; CHECK: @ %bb.0: @ %entry
335 ; CHECK-NEXT: vmovlb.u16 q2, q2
336 ; CHECK-NEXT: vmovlb.s16 q1, q1
337 ; CHECK-NEXT: vmovlb.s16 q0, q0
338 ; CHECK-NEXT: vpt.i32 eq, q2, zr
339 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
340 ; CHECK-NEXT: bx lr
341 entry:
342 %c = icmp eq <4 x i16> %b, zeroinitializer
343 %xx = sext <4 x i16> %x to <4 x i64>
344 %yy = sext <4 x i16> %y to <4 x i64>
345 %m = mul <4 x i64> %xx, %yy
346 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
347 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
348 ret i64 %z
349 }
351 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
352 ; CHECK-LABEL: add_v2i16_v2i64_zext:
353 ; CHECK: @ %bb.0: @ %entry
354 ; CHECK-NEXT: vmov.i64 q3, #0xffff
355 ; CHECK-NEXT: vand q1, q1, q3
356 ; CHECK-NEXT: vand q0, q0, q3
357 ; CHECK-NEXT: vmov r0, s6
358 ; CHECK-NEXT: vmov r1, s2
359 ; CHECK-NEXT: vmov r2, s4
360 ; CHECK-NEXT: vand q1, q2, q3
361 ; CHECK-NEXT: vmov r3, s0
362 ; CHECK-NEXT: umull r0, r1, r1, r0
363 ; CHECK-NEXT: umull r2, r3, r3, r2
364 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
365 ; CHECK-NEXT: vmov r0, s4
366 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
367 ; CHECK-NEXT: movs r1, #0
368 ; CHECK-NEXT: cmp r0, #0
369 ; CHECK-NEXT: csetm r0, eq
370 ; CHECK-NEXT: bfi r1, r0, #0, #8
371 ; CHECK-NEXT: vmov r0, s6
372 ; CHECK-NEXT: vmov.i32 q1, #0x0
373 ; CHECK-NEXT: cmp r0, #0
374 ; CHECK-NEXT: csetm r0, eq
375 ; CHECK-NEXT: bfi r1, r0, #8, #8
376 ; CHECK-NEXT: vmsr p0, r1
377 ; CHECK-NEXT: vpsel q0, q0, q1
378 ; CHECK-NEXT: vmov r0, r1, d1
379 ; CHECK-NEXT: vmov r2, r3, d0
380 ; CHECK-NEXT: adds r0, r0, r2
381 ; CHECK-NEXT: adcs r1, r3
382 ; CHECK-NEXT: bx lr
383 entry:
384 %c = icmp eq <2 x i16> %b, zeroinitializer
385 %xx = zext <2 x i16> %x to <2 x i64>
386 %yy = zext <2 x i16> %y to <2 x i64>
387 %m = mul <2 x i64> %xx, %yy
388 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
389 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
390 ret i64 %z
391 }
393 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
394 ; CHECK-LABEL: add_v2i16_v2i64_sext:
395 ; CHECK: @ %bb.0: @ %entry
396 ; CHECK-NEXT: vmov.i32 q3, #0xffff
397 ; CHECK-NEXT: movs r1, #0
398 ; CHECK-NEXT: vand q2, q2, q3
399 ; CHECK-NEXT: vmov r2, s4
400 ; CHECK-NEXT: vmov r0, s8
401 ; CHECK-NEXT: vmov r3, s0
402 ; CHECK-NEXT: cmp r0, #0
403 ; CHECK-NEXT: sxth r2, r2
404 ; CHECK-NEXT: csetm r0, eq
405 ; CHECK-NEXT: bfi r1, r0, #0, #8
406 ; CHECK-NEXT: vmov r0, s10
407 ; CHECK-NEXT: sxth r3, r3
408 ; CHECK-NEXT: smull r2, r3, r3, r2
409 ; CHECK-NEXT: cmp r0, #0
410 ; CHECK-NEXT: csetm r0, eq
411 ; CHECK-NEXT: bfi r1, r0, #8, #8
412 ; CHECK-NEXT: vmov r0, s6
413 ; CHECK-NEXT: vmsr p0, r1
414 ; CHECK-NEXT: vmov r1, s2
415 ; CHECK-NEXT: vmov.i32 q1, #0x0
416 ; CHECK-NEXT: sxth r0, r0
417 ; CHECK-NEXT: sxth r1, r1
418 ; CHECK-NEXT: smull r0, r1, r1, r0
419 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
420 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
421 ; CHECK-NEXT: vpsel q0, q0, q1
422 ; CHECK-NEXT: vmov r0, r1, d1
423 ; CHECK-NEXT: vmov r2, r3, d0
424 ; CHECK-NEXT: adds r0, r0, r2
425 ; CHECK-NEXT: adcs r1, r3
426 ; CHECK-NEXT: bx lr
427 entry:
428 %c = icmp eq <2 x i16> %b, zeroinitializer
429 %xx = sext <2 x i16> %x to <2 x i64>
430 %yy = sext <2 x i16> %y to <2 x i64>
431 %m = mul <2 x i64> %xx, %yy
432 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
433 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
434 ret i64 %z
435 }
437 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
438 ; CHECK-LABEL: add_v16i8_v16i32_zext:
439 ; CHECK: @ %bb.0: @ %entry
440 ; CHECK-NEXT: vpt.i8 eq, q2, zr
441 ; CHECK-NEXT: vmlavt.u8 r0, q0, q1
442 ; CHECK-NEXT: bx lr
443 entry:
444 %c = icmp eq <16 x i8> %b, zeroinitializer
445 %xx = zext <16 x i8> %x to <16 x i32>
446 %yy = zext <16 x i8> %y to <16 x i32>
447 %m = mul <16 x i32> %xx, %yy
448 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
449 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
450 ret i32 %z
451 }
453 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
454 ; CHECK-LABEL: add_v16i8_v16i32_sext:
455 ; CHECK: @ %bb.0: @ %entry
456 ; CHECK-NEXT: vpt.i8 eq, q2, zr
457 ; CHECK-NEXT: vmlavt.s8 r0, q0, q1
458 ; CHECK-NEXT: bx lr
459 entry:
460 %c = icmp eq <16 x i8> %b, zeroinitializer
461 %xx = sext <16 x i8> %x to <16 x i32>
462 %yy = sext <16 x i8> %y to <16 x i32>
463 %m = mul <16 x i32> %xx, %yy
464 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
465 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
466 ret i32 %z
467 }
469 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
470 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
471 ; CHECK: @ %bb.0: @ %entry
472 ; CHECK-NEXT: vpt.i8 eq, q2, zr
473 ; CHECK-NEXT: vmlavt.u8 r0, q0, q1
474 ; CHECK-NEXT: bx lr
475 entry:
476 %c = icmp eq <16 x i8> %b, zeroinitializer
477 %xx = zext <16 x i8> %x to <16 x i16>
478 %yy = zext <16 x i8> %y to <16 x i16>
479 %m = mul <16 x i16> %xx, %yy
480 %ma = zext <16 x i16> %m to <16 x i32>
481 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
482 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
483 ret i32 %z
484 }
486 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
487 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
488 ; CHECK: @ %bb.0: @ %entry
489 ; CHECK-NEXT: vpt.i8 eq, q2, zr
490 ; CHECK-NEXT: vmlavt.s8 r0, q0, q1
491 ; CHECK-NEXT: bx lr
492 entry:
493 %c = icmp eq <16 x i8> %b, zeroinitializer
494 %xx = sext <16 x i8> %x to <16 x i16>
495 %yy = sext <16 x i8> %y to <16 x i16>
496 %m = mul <16 x i16> %xx, %yy
497 %ma = sext <16 x i16> %m to <16 x i32>
498 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
499 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
500 ret i32 %z
501 }
503 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
504 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
505 ; CHECK: @ %bb.0: @ %entry
506 ; CHECK-NEXT: vpt.i8 eq, q2, zr
507 ; CHECK-NEXT: vmlavt.s8 r0, q0, q0
508 ; CHECK-NEXT: bx lr
509 entry:
510 %c = icmp eq <16 x i8> %b, zeroinitializer
511 %xx = sext <16 x i8> %x to <16 x i16>
512 %m = mul <16 x i16> %xx, %xx
513 %ma = zext <16 x i16> %m to <16 x i32>
514 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
515 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
516 ret i32 %z
517 }
519 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
520 ; CHECK-LABEL: add_v8i8_v8i32_zext:
521 ; CHECK: @ %bb.0: @ %entry
522 ; CHECK-NEXT: vmovlb.u8 q2, q2
523 ; CHECK-NEXT: vmovlb.u8 q1, q1
524 ; CHECK-NEXT: vmovlb.u8 q0, q0
525 ; CHECK-NEXT: vpt.i16 eq, q2, zr
526 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
527 ; CHECK-NEXT: bx lr
528 entry:
529 %c = icmp eq <8 x i8> %b, zeroinitializer
530 %xx = zext <8 x i8> %x to <8 x i32>
531 %yy = zext <8 x i8> %y to <8 x i32>
532 %m = mul <8 x i32> %xx, %yy
533 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
534 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
535 ret i32 %z
536 }
538 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
539 ; CHECK-LABEL: add_v8i8_v8i32_sext:
540 ; CHECK: @ %bb.0: @ %entry
541 ; CHECK-NEXT: vmovlb.u8 q2, q2
542 ; CHECK-NEXT: vmovlb.s8 q1, q1
543 ; CHECK-NEXT: vmovlb.s8 q0, q0
544 ; CHECK-NEXT: vpt.i16 eq, q2, zr
545 ; CHECK-NEXT: vmlavt.s16 r0, q0, q1
546 ; CHECK-NEXT: bx lr
547 entry:
548 %c = icmp eq <8 x i8> %b, zeroinitializer
549 %xx = sext <8 x i8> %x to <8 x i32>
550 %yy = sext <8 x i8> %y to <8 x i32>
551 %m = mul <8 x i32> %xx, %yy
552 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
553 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
554 ret i32 %z
555 }
557 define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_zext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
558 ; CHECK-LABEL: add_v8i8i16_v8i32_zext:
559 ; CHECK: @ %bb.0: @ %entry
560 ; CHECK-NEXT: vmovlb.u8 q2, q2
561 ; CHECK-NEXT: vmovlb.u8 q0, q0
562 ; CHECK-NEXT: vpt.i16 eq, q2, zr
563 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
564 ; CHECK-NEXT: bx lr
565 entry:
566 %c = icmp eq <8 x i8> %b, zeroinitializer
567 %xx = zext <8 x i8> %x to <8 x i32>
568 %yy = zext <8 x i16> %y to <8 x i32>
569 %m = mul <8 x i32> %xx, %yy
570 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
571 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
572 ret i32 %z
573 }
575 define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
576 ; CHECK-LABEL: add_v8i8i16_v8i32_sext:
577 ; CHECK: @ %bb.0: @ %entry
578 ; CHECK-NEXT: vmovlb.u8 q2, q2
579 ; CHECK-NEXT: vmovlb.s8 q0, q0
580 ; CHECK-NEXT: vpt.i16 eq, q2, zr
581 ; CHECK-NEXT: vmlavt.s16 r0, q0, q1
582 ; CHECK-NEXT: bx lr
583 entry:
584 %c = icmp eq <8 x i8> %b, zeroinitializer
585 %xx = sext <8 x i8> %x to <8 x i32>
586 %yy = sext <8 x i16> %y to <8 x i32>
587 %m = mul <8 x i32> %xx, %yy
588 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
589 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
590 ret i32 %z
591 }
593 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
594 ; CHECK-LABEL: add_v4i8_v4i32_zext:
595 ; CHECK: @ %bb.0: @ %entry
596 ; CHECK-NEXT: vmov.i32 q3, #0xff
597 ; CHECK-NEXT: vand q2, q2, q3
598 ; CHECK-NEXT: vand q1, q1, q3
599 ; CHECK-NEXT: vand q0, q0, q3
600 ; CHECK-NEXT: vpt.i32 eq, q2, zr
601 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
602 ; CHECK-NEXT: bx lr
603 entry:
604 %c = icmp eq <4 x i8> %b, zeroinitializer
605 %xx = zext <4 x i8> %x to <4 x i32>
606 %yy = zext <4 x i8> %y to <4 x i32>
607 %m = mul <4 x i32> %xx, %yy
608 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
609 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
610 ret i32 %z
611 }
613 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
614 ; CHECK-LABEL: add_v4i8_v4i32_sext:
615 ; CHECK: @ %bb.0: @ %entry
616 ; CHECK-NEXT: vmov.i32 q3, #0xff
617 ; CHECK-NEXT: vmovlb.s8 q1, q1
618 ; CHECK-NEXT: vmovlb.s8 q0, q0
619 ; CHECK-NEXT: vand q2, q2, q3
620 ; CHECK-NEXT: vmovlb.s16 q1, q1
621 ; CHECK-NEXT: vmovlb.s16 q0, q0
622 ; CHECK-NEXT: vpt.i32 eq, q2, zr
623 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
624 ; CHECK-NEXT: bx lr
625 entry:
626 %c = icmp eq <4 x i8> %b, zeroinitializer
627 %xx = sext <4 x i8> %x to <4 x i32>
628 %yy = sext <4 x i8> %y to <4 x i32>
629 %m = mul <4 x i32> %xx, %yy
630 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
631 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
632 ret i32 %z
633 }
635 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_szext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
636 ; CHECK-LABEL: add_v4i8_v4i32_szext:
637 ; CHECK: @ %bb.0: @ %entry
638 ; CHECK-NEXT: vmov.i32 q3, #0xff
639 ; CHECK-NEXT: vmovlb.s8 q0, q0
640 ; CHECK-NEXT: vand q2, q2, q3
641 ; CHECK-NEXT: vand q1, q1, q3
642 ; CHECK-NEXT: vmovlb.s16 q0, q0
643 ; CHECK-NEXT: vpt.i32 eq, q2, zr
644 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
645 ; CHECK-NEXT: bx lr
646 entry:
647 %c = icmp eq <4 x i8> %b, zeroinitializer
648 %xx = sext <4 x i8> %x to <4 x i32>
649 %yy = zext <4 x i8> %y to <4 x i32>
650 %m = mul <4 x i32> %xx, %yy
651 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
652 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
653 ret i32 %z
654 }
656 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
657 ; CHECK-LABEL: add_v16i8_v16i16_zext:
658 ; CHECK: @ %bb.0: @ %entry
659 ; CHECK-NEXT: vpt.i8 eq, q2, zr
660 ; CHECK-NEXT: vmlavt.u8 r0, q0, q1
661 ; CHECK-NEXT: uxth r0, r0
662 ; CHECK-NEXT: bx lr
663 entry:
664 %c = icmp eq <16 x i8> %b, zeroinitializer
665 %xx = zext <16 x i8> %x to <16 x i16>
666 %yy = zext <16 x i8> %y to <16 x i16>
667 %m = mul <16 x i16> %xx, %yy
668 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
669 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
670 ret i16 %z
671 }
673 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
674 ; CHECK-LABEL: add_v16i8_v16i16_sext:
675 ; CHECK: @ %bb.0: @ %entry
676 ; CHECK-NEXT: vpt.i8 eq, q2, zr
677 ; CHECK-NEXT: vmlavt.s8 r0, q0, q1
678 ; CHECK-NEXT: sxth r0, r0
679 ; CHECK-NEXT: bx lr
680 entry:
681 %c = icmp eq <16 x i8> %b, zeroinitializer
682 %xx = sext <16 x i8> %x to <16 x i16>
683 %yy = sext <16 x i8> %y to <16 x i16>
684 %m = mul <16 x i16> %xx, %yy
685 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
686 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
687 ret i16 %z
688 }
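; With mismatched extensions (sext of %x, zext of %y) the reduction cannot use
; a single vmlav, so the inputs are spilled and reloaded with vldrb.s16 /
; vldrb.u16 and reduced in two predicated halves.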
690 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
691 ; CHECK-LABEL: add_v16i8_v16i16_szext:
692 ; CHECK: @ %bb.0: @ %entry
693 ; CHECK-NEXT: .pad #32
694 ; CHECK-NEXT: sub sp, #32
695 ; CHECK-NEXT: add r0, sp, #16
696 ; CHECK-NEXT: mov r1, sp
697 ; CHECK-NEXT: vstrw.32 q1, [r0]
698 ; CHECK-NEXT: vstrw.32 q0, [r1]
699 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
700 ; CHECK-NEXT: vmov.i8 q0, #0x0
701 ; CHECK-NEXT: vmov.i8 q1, #0xff
702 ; CHECK-NEXT: vldrb.s16 q2, [r1, #8]
703 ; CHECK-NEXT: vpsel q0, q1, q0
704 ; CHECK-NEXT: vmov.u8 r2, q0[8]
705 ; CHECK-NEXT: vmov.u8 r3, q0[0]
706 ; CHECK-NEXT: vmov.16 q1[0], r2
707 ; CHECK-NEXT: vmov.u8 r2, q0[9]
708 ; CHECK-NEXT: vmov.16 q1[1], r2
709 ; CHECK-NEXT: vmov.u8 r2, q0[10]
710 ; CHECK-NEXT: vmov.16 q1[2], r2
711 ; CHECK-NEXT: vmov.u8 r2, q0[11]
712 ; CHECK-NEXT: vmov.16 q1[3], r2
713 ; CHECK-NEXT: vmov.u8 r2, q0[12]
714 ; CHECK-NEXT: vmov.16 q1[4], r2
715 ; CHECK-NEXT: vmov.u8 r2, q0[13]
716 ; CHECK-NEXT: vmov.16 q1[5], r2
717 ; CHECK-NEXT: vmov.u8 r2, q0[14]
718 ; CHECK-NEXT: vmov.16 q1[6], r2
719 ; CHECK-NEXT: vmov.u8 r2, q0[15]
720 ; CHECK-NEXT: vmov.16 q1[7], r2
721 ; CHECK-NEXT: vcmp.i16 ne, q1, zr
722 ; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
723 ; CHECK-NEXT: vpst
724 ; CHECK-NEXT: vmlavt.u16 r2, q2, q1
725 ; CHECK-NEXT: vmov.16 q1[0], r3
726 ; CHECK-NEXT: vmov.u8 r3, q0[1]
727 ; CHECK-NEXT: vmov.16 q1[1], r3
728 ; CHECK-NEXT: vmov.u8 r3, q0[2]
729 ; CHECK-NEXT: vmov.16 q1[2], r3
730 ; CHECK-NEXT: vmov.u8 r3, q0[3]
731 ; CHECK-NEXT: vmov.16 q1[3], r3
732 ; CHECK-NEXT: vmov.u8 r3, q0[4]
733 ; CHECK-NEXT: vmov.16 q1[4], r3
734 ; CHECK-NEXT: vmov.u8 r3, q0[5]
735 ; CHECK-NEXT: vmov.16 q1[5], r3
736 ; CHECK-NEXT: vmov.u8 r3, q0[6]
737 ; CHECK-NEXT: vmov.16 q1[6], r3
738 ; CHECK-NEXT: vmov.u8 r3, q0[7]
739 ; CHECK-NEXT: vmov.16 q1[7], r3
740 ; CHECK-NEXT: vldrb.u16 q0, [r0]
741 ; CHECK-NEXT: vcmp.i16 ne, q1, zr
742 ; CHECK-NEXT: vldrb.s16 q1, [r1]
743 ; CHECK-NEXT: vpst
744 ; CHECK-NEXT: vmlavat.u16 r2, q1, q0
745 ; CHECK-NEXT: sxth r0, r2
746 ; CHECK-NEXT: add sp, #32
747 ; CHECK-NEXT: bx lr
748 entry:
749 %c = icmp eq <16 x i8> %b, zeroinitializer
750 %xx = sext <16 x i8> %x to <16 x i16>
751 %yy = zext <16 x i8> %y to <16 x i16>
752 %m = mul <16 x i16> %xx, %yy
753 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
754 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
755 ret i16 %z
756 }
758 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
759 ; CHECK-LABEL: add_v8i8_v8i16_zext:
760 ; CHECK: @ %bb.0: @ %entry
761 ; CHECK-NEXT: vmovlb.u8 q2, q2
762 ; CHECK-NEXT: vmovlb.u8 q1, q1
763 ; CHECK-NEXT: vmovlb.u8 q0, q0
764 ; CHECK-NEXT: vpt.i16 eq, q2, zr
765 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
766 ; CHECK-NEXT: uxth r0, r0
767 ; CHECK-NEXT: bx lr
768 entry:
769 %c = icmp eq <8 x i8> %b, zeroinitializer
770 %xx = zext <8 x i8> %x to <8 x i16>
771 %yy = zext <8 x i8> %y to <8 x i16>
772 %m = mul <8 x i16> %xx, %yy
773 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
774 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
775 ret i16 %z
776 }
778 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
779 ; CHECK-LABEL: add_v8i8_v8i16_sext:
780 ; CHECK: @ %bb.0: @ %entry
781 ; CHECK-NEXT: vmovlb.u8 q2, q2
782 ; CHECK-NEXT: vmovlb.s8 q1, q1
783 ; CHECK-NEXT: vmovlb.s8 q0, q0
784 ; CHECK-NEXT: vpt.i16 eq, q2, zr
785 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
786 ; CHECK-NEXT: sxth r0, r0
787 ; CHECK-NEXT: bx lr
788 entry:
789 %c = icmp eq <8 x i8> %b, zeroinitializer
790 %xx = sext <8 x i8> %x to <8 x i16>
791 %yy = sext <8 x i8> %y to <8 x i16>
792 %m = mul <8 x i16> %xx, %yy
793 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
794 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
795 ret i16 %z
796 }
798 define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
799 ; CHECK-LABEL: add_v16i8_v16i8:
800 ; CHECK: @ %bb.0: @ %entry
801 ; CHECK-NEXT: vpt.i8 eq, q2, zr
802 ; CHECK-NEXT: vmlavt.u8 r0, q0, q1
803 ; CHECK-NEXT: uxtb r0, r0
804 ; CHECK-NEXT: bx lr
805 entry:
806 %c = icmp eq <16 x i8> %b, zeroinitializer
807 %m = mul <16 x i8> %x, %y
808 %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
809 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
810 ret i8 %z
811 }
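; The v16i8 -> v16i64 reductions are not matched to a single instruction and
; are fully expanded: pairs of lanes are extended, multiplied with umull/smull,
; masked with vpsel and accumulated pairwise with adds/adcs.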
813 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
814 ; CHECK-LABEL: add_v16i8_v16i64_zext:
815 ; CHECK: @ %bb.0: @ %entry
816 ; CHECK-NEXT: .save {r7, lr}
817 ; CHECK-NEXT: push {r7, lr}
818 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
819 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
820 ; CHECK-NEXT: .pad #32
821 ; CHECK-NEXT: sub sp, #32
822 ; CHECK-NEXT: vmov q3, q0
823 ; CHECK-NEXT: vmov.i8 q0, #0x0
824 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
825 ; CHECK-NEXT: vmov.i8 q2, #0xff
826 ; CHECK-NEXT: vpsel q6, q2, q0
827 ; CHECK-NEXT: vmov q4, q0
828 ; CHECK-NEXT: vmov.u8 r0, q6[0]
829 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
830 ; CHECK-NEXT: vmov.16 q0[0], r0
831 ; CHECK-NEXT: vmov.u8 r0, q6[1]
832 ; CHECK-NEXT: vmov.16 q0[1], r0
833 ; CHECK-NEXT: vmov.u8 r0, q6[2]
834 ; CHECK-NEXT: vmov.16 q0[2], r0
835 ; CHECK-NEXT: vmov.u8 r0, q6[3]
836 ; CHECK-NEXT: vmov.16 q0[3], r0
837 ; CHECK-NEXT: vmov.u8 r0, q6[4]
838 ; CHECK-NEXT: vmov.16 q0[4], r0
839 ; CHECK-NEXT: vmov.u8 r0, q6[5]
840 ; CHECK-NEXT: vmov.16 q0[5], r0
841 ; CHECK-NEXT: vmov.u8 r0, q6[6]
842 ; CHECK-NEXT: vmov.16 q0[6], r0
843 ; CHECK-NEXT: vmov.u8 r0, q6[7]
844 ; CHECK-NEXT: vmov.16 q0[7], r0
845 ; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill
846 ; CHECK-NEXT: vcmp.i16 ne, q0, zr
847 ; CHECK-NEXT: vmov.u8 r2, q3[0]
848 ; CHECK-NEXT: vpsel q7, q2, q4
849 ; CHECK-NEXT: vmov.u16 r0, q7[2]
850 ; CHECK-NEXT: vmov.u16 r1, q7[0]
851 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
852 ; CHECK-NEXT: vmov.u16 r0, q7[3]
853 ; CHECK-NEXT: vmov.u16 r1, q7[1]
854 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
855 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
856 ; CHECK-NEXT: vpsel q0, q2, q4
857 ; CHECK-NEXT: vmov r0, r1, d0
858 ; CHECK-NEXT: vmov q2[2], q2[0], r0, r1
859 ; CHECK-NEXT: vmov q2[3], q2[1], r0, r1
860 ; CHECK-NEXT: vmov.u8 r0, q1[1]
861 ; CHECK-NEXT: vmov.u8 r1, q1[0]
862 ; CHECK-NEXT: vcmp.i32 ne, q2, zr
863 ; CHECK-NEXT: vmov q5[2], q5[0], r1, r0
864 ; CHECK-NEXT: vmov.u8 r1, q3[1]
865 ; CHECK-NEXT: vmov.i64 q2, #0xff
866 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
867 ; CHECK-NEXT: vand q5, q5, q2
868 ; CHECK-NEXT: vand q4, q4, q2
869 ; CHECK-NEXT: vmov r0, s22
870 ; CHECK-NEXT: vmov r1, s18
871 ; CHECK-NEXT: vmov r2, s20
872 ; CHECK-NEXT: vmov.i32 q5, #0x0
873 ; CHECK-NEXT: vmov r3, s16
874 ; CHECK-NEXT: umull r0, r1, r1, r0
875 ; CHECK-NEXT: umull r2, r3, r3, r2
876 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r0
877 ; CHECK-NEXT: vmov q4[3], q4[1], r3, r1
878 ; CHECK-NEXT: vpsel q4, q4, q5
879 ; CHECK-NEXT: vmov r0, r1, d9
880 ; CHECK-NEXT: vmov r2, r3, d8
881 ; CHECK-NEXT: adds.w r12, r2, r0
882 ; CHECK-NEXT: vmov.u8 r0, q3[2]
883 ; CHECK-NEXT: adc.w lr, r3, r1
884 ; CHECK-NEXT: vmov r2, r3, d1
885 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
886 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
887 ; CHECK-NEXT: vmov.u8 r2, q1[3]
888 ; CHECK-NEXT: vmov.u8 r3, q1[2]
889 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
890 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
891 ; CHECK-NEXT: vmov.u8 r3, q3[3]
892 ; CHECK-NEXT: vmov q4[2], q4[0], r0, r3
893 ; CHECK-NEXT: vand q0, q0, q2
894 ; CHECK-NEXT: vand q4, q4, q2
895 ; CHECK-NEXT: vmov r2, s2
896 ; CHECK-NEXT: vmov r0, s18
897 ; CHECK-NEXT: vmov r1, s16
898 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
899 ; CHECK-NEXT: vmov r3, s0
900 ; CHECK-NEXT: umull r0, r2, r0, r2
901 ; CHECK-NEXT: umull r1, r3, r1, r3
902 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
903 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
904 ; CHECK-NEXT: vpsel q0, q0, q5
905 ; CHECK-NEXT: vmov r0, r1, d0
906 ; CHECK-NEXT: vmov r2, r3, d1
907 ; CHECK-NEXT: adds.w r0, r0, r12
908 ; CHECK-NEXT: adc.w r1, r1, lr
909 ; CHECK-NEXT: adds.w r12, r0, r2
910 ; CHECK-NEXT: adc.w lr, r1, r3
911 ; CHECK-NEXT: vmov.u16 r2, q7[6]
912 ; CHECK-NEXT: vmov.u16 r3, q7[4]
913 ; CHECK-NEXT: vmov.u8 r0, q3[4]
914 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
915 ; CHECK-NEXT: vmov.u16 r2, q7[7]
916 ; CHECK-NEXT: vmov.u16 r3, q7[5]
917 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
918 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
919 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
920 ; CHECK-NEXT: vpsel q0, q0, q4
921 ; CHECK-NEXT: vmov r2, r3, d0
922 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
923 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3
924 ; CHECK-NEXT: vmov.u8 r2, q1[5]
925 ; CHECK-NEXT: vmov.u8 r3, q1[4]
926 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
927 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
928 ; CHECK-NEXT: vmov.u8 r3, q3[5]
929 ; CHECK-NEXT: vmov q7[2], q7[0], r0, r3
930 ; CHECK-NEXT: vand q4, q4, q2
931 ; CHECK-NEXT: vand q7, q7, q2
932 ; CHECK-NEXT: vmov r2, s18
933 ; CHECK-NEXT: vmov r0, s30
934 ; CHECK-NEXT: vmov r1, s28
935 ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
936 ; CHECK-NEXT: vmov r3, s16
937 ; CHECK-NEXT: umull r0, r2, r0, r2
938 ; CHECK-NEXT: umull r1, r3, r1, r3
939 ; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
940 ; CHECK-NEXT: vmov q4[3], q4[1], r3, r2
941 ; CHECK-NEXT: vpsel q4, q4, q5
942 ; CHECK-NEXT: vmov r0, r1, d8
943 ; CHECK-NEXT: vmov r2, r3, d9
944 ; CHECK-NEXT: adds.w r0, r0, r12
945 ; CHECK-NEXT: adc.w r1, r1, lr
946 ; CHECK-NEXT: adds.w r12, r0, r2
947 ; CHECK-NEXT: adc.w lr, r1, r3
948 ; CHECK-NEXT: vmov r2, r3, d1
949 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
950 ; CHECK-NEXT: vmov.u8 r0, q3[6]
951 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
952 ; CHECK-NEXT: vmov.u8 r2, q1[7]
953 ; CHECK-NEXT: vmov.u8 r3, q1[6]
954 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
955 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
956 ; CHECK-NEXT: vmov.u8 r3, q3[7]
957 ; CHECK-NEXT: vmov q4[2], q4[0], r0, r3
958 ; CHECK-NEXT: vand q0, q0, q2
959 ; CHECK-NEXT: vand q4, q4, q2
960 ; CHECK-NEXT: vmov r2, s2
961 ; CHECK-NEXT: vmov r0, s18
962 ; CHECK-NEXT: vmov r1, s16
963 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
964 ; CHECK-NEXT: vmov r3, s0
965 ; CHECK-NEXT: umull r0, r2, r0, r2
966 ; CHECK-NEXT: umull r1, r3, r1, r3
967 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
968 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
969 ; CHECK-NEXT: vpsel q0, q0, q5
970 ; CHECK-NEXT: vmov r0, r1, d0
971 ; CHECK-NEXT: vmov r2, r3, d1
972 ; CHECK-NEXT: adds.w r0, r0, r12
973 ; CHECK-NEXT: adc.w r1, r1, lr
974 ; CHECK-NEXT: adds.w r12, r0, r2
975 ; CHECK-NEXT: vmov.u8 r2, q6[8]
976 ; CHECK-NEXT: adc.w lr, r1, r3
977 ; CHECK-NEXT: vmov.16 q0[0], r2
978 ; CHECK-NEXT: vmov.u8 r2, q6[9]
979 ; CHECK-NEXT: vmov.16 q0[1], r2
980 ; CHECK-NEXT: vmov.u8 r2, q6[10]
981 ; CHECK-NEXT: vmov.16 q0[2], r2
982 ; CHECK-NEXT: vmov.u8 r2, q6[11]
983 ; CHECK-NEXT: vmov.16 q0[3], r2
984 ; CHECK-NEXT: vmov.u8 r2, q6[12]
985 ; CHECK-NEXT: vmov.16 q0[4], r2
986 ; CHECK-NEXT: vmov.u8 r2, q6[13]
987 ; CHECK-NEXT: vmov.16 q0[5], r2
988 ; CHECK-NEXT: vmov.u8 r2, q6[14]
989 ; CHECK-NEXT: vmov.16 q0[6], r2
990 ; CHECK-NEXT: vmov.u8 r2, q6[15]
991 ; CHECK-NEXT: vmov.16 q0[7], r2
992 ; CHECK-NEXT: vmov.u8 r0, q3[8]
993 ; CHECK-NEXT: vcmp.i16 ne, q0, zr
994 ; CHECK-NEXT: vpsel q6, q7, q4
995 ; CHECK-NEXT: vmov.u16 r2, q6[2]
996 ; CHECK-NEXT: vmov.u16 r3, q6[0]
997 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
998 ; CHECK-NEXT: vmov.u16 r2, q6[3]
999 ; CHECK-NEXT: vmov.u16 r3, q6[1]
1000 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1001 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1002 ; CHECK-NEXT: vpsel q0, q7, q4
1003 ; CHECK-NEXT: vmov r2, r3, d0
1004 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
1005 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3
1006 ; CHECK-NEXT: vmov.u8 r2, q1[9]
1007 ; CHECK-NEXT: vmov.u8 r3, q1[8]
1008 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
1009 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
1010 ; CHECK-NEXT: vmov.u8 r3, q3[9]
1011 ; CHECK-NEXT: vmov q7[2], q7[0], r0, r3
1012 ; CHECK-NEXT: vand q4, q4, q2
1013 ; CHECK-NEXT: vand q7, q7, q2
1014 ; CHECK-NEXT: vmov r2, s18
1015 ; CHECK-NEXT: vmov r0, s30
1016 ; CHECK-NEXT: vmov r3, s16
1017 ; CHECK-NEXT: vmov r1, s28
1018 ; CHECK-NEXT: umull r0, r2, r0, r2
1019 ; CHECK-NEXT: umull r1, r3, r1, r3
1020 ; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
1021 ; CHECK-NEXT: vmov q4[3], q4[1], r3, r2
1022 ; CHECK-NEXT: vpsel q4, q4, q5
1023 ; CHECK-NEXT: vmov r0, r1, d8
1024 ; CHECK-NEXT: vmov r2, r3, d9
1025 ; CHECK-NEXT: adds.w r0, r0, r12
1026 ; CHECK-NEXT: adc.w r1, r1, lr
1027 ; CHECK-NEXT: adds.w r12, r0, r2
1028 ; CHECK-NEXT: adc.w lr, r1, r3
1029 ; CHECK-NEXT: vmov r2, r3, d1
1030 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
1031 ; CHECK-NEXT: vmov.u8 r0, q3[10]
1032 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
1033 ; CHECK-NEXT: vmov.u8 r2, q1[11]
1034 ; CHECK-NEXT: vmov.u8 r3, q1[10]
1035 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1036 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1037 ; CHECK-NEXT: vmov.u8 r3, q3[11]
1038 ; CHECK-NEXT: vmov q4[2], q4[0], r0, r3
1039 ; CHECK-NEXT: vand q0, q0, q2
1040 ; CHECK-NEXT: vand q4, q4, q2
1041 ; CHECK-NEXT: vmov r2, s2
1042 ; CHECK-NEXT: vmov r0, s18
1043 ; CHECK-NEXT: vmov r1, s16
1044 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
1045 ; CHECK-NEXT: vmov r3, s0
1046 ; CHECK-NEXT: umull r0, r2, r0, r2
1047 ; CHECK-NEXT: umull r1, r3, r1, r3
1048 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
1049 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1050 ; CHECK-NEXT: vpsel q0, q0, q5
1051 ; CHECK-NEXT: vmov r0, r1, d0
1052 ; CHECK-NEXT: vmov r2, r3, d1
1053 ; CHECK-NEXT: adds.w r0, r0, r12
1054 ; CHECK-NEXT: adc.w r1, r1, lr
1055 ; CHECK-NEXT: adds.w r12, r0, r2
1056 ; CHECK-NEXT: adc.w lr, r1, r3
1057 ; CHECK-NEXT: vmov.u16 r2, q6[6]
1058 ; CHECK-NEXT: vmov.u16 r3, q6[4]
1059 ; CHECK-NEXT: vmov.u8 r0, q3[12]
1060 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1061 ; CHECK-NEXT: vmov.u16 r2, q6[7]
1062 ; CHECK-NEXT: vmov.u16 r3, q6[5]
1063 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1064 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1065 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
1066 ; CHECK-NEXT: vpsel q0, q0, q4
1067 ; CHECK-NEXT: vmov r2, r3, d0
1068 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
1069 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3
1070 ; CHECK-NEXT: vmov.u8 r2, q1[13]
1071 ; CHECK-NEXT: vmov.u8 r3, q1[12]
1072 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
1073 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
1074 ; CHECK-NEXT: vmov.u8 r3, q3[13]
1075 ; CHECK-NEXT: vmov q6[2], q6[0], r0, r3
1076 ; CHECK-NEXT: vand q4, q4, q2
1077 ; CHECK-NEXT: vand q6, q6, q2
1078 ; CHECK-NEXT: vmov r2, s18
1079 ; CHECK-NEXT: vmov r0, s26
1080 ; CHECK-NEXT: vmov r3, s16
1081 ; CHECK-NEXT: vmov r1, s24
1082 ; CHECK-NEXT: umull r0, r2, r0, r2
1083 ; CHECK-NEXT: umull r1, r3, r1, r3
1084 ; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
1085 ; CHECK-NEXT: vmov q4[3], q4[1], r3, r2
1086 ; CHECK-NEXT: vpsel q4, q4, q5
1087 ; CHECK-NEXT: vmov r0, r1, d8
1088 ; CHECK-NEXT: vmov r2, r3, d9
1089 ; CHECK-NEXT: adds.w r0, r0, r12
1090 ; CHECK-NEXT: adc.w r1, r1, lr
1091 ; CHECK-NEXT: adds.w r12, r0, r2
1092 ; CHECK-NEXT: adc.w lr, r1, r3
1093 ; CHECK-NEXT: vmov r2, r3, d1
1094 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
1095 ; CHECK-NEXT: vmov.u8 r0, q3[14]
1096 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
1097 ; CHECK-NEXT: vmov.u8 r2, q1[15]
1098 ; CHECK-NEXT: vmov.u8 r3, q1[14]
1099 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1100 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1101 ; CHECK-NEXT: vmov.u8 r3, q3[15]
1102 ; CHECK-NEXT: vmov q1[2], q1[0], r0, r3
1103 ; CHECK-NEXT: vand q0, q0, q2
1104 ; CHECK-NEXT: vand q1, q1, q2
1105 ; CHECK-NEXT: vmov r2, s2
1106 ; CHECK-NEXT: vmov r0, s6
1107 ; CHECK-NEXT: vmov r3, s0
1108 ; CHECK-NEXT: vmov r1, s4
1109 ; CHECK-NEXT: umull r0, r2, r0, r2
1110 ; CHECK-NEXT: umull r1, r3, r1, r3
1111 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
1112 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1113 ; CHECK-NEXT: vpsel q0, q0, q5
1114 ; CHECK-NEXT: vmov r0, r1, d0
1115 ; CHECK-NEXT: vmov r2, r3, d1
1116 ; CHECK-NEXT: adds.w r0, r0, r12
1117 ; CHECK-NEXT: adc.w r1, r1, lr
1118 ; CHECK-NEXT: adds r0, r0, r2
1119 ; CHECK-NEXT: adcs r1, r3
1120 ; CHECK-NEXT: add sp, #32
1121 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1122 ; CHECK-NEXT: pop {r7, pc}
1123 entry:
1124 %c = icmp eq <16 x i8> %b, zeroinitializer
1125 %xx = zext <16 x i8> %x to <16 x i64>
1126 %yy = zext <16 x i8> %y to <16 x i64>
1127 %m = mul <16 x i64> %xx, %yy
1128 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
1129 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
1130 ret i64 %z
1131 }
1133 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
1134 ; CHECK-LABEL: add_v16i8_v16i64_sext:
1135 ; CHECK: @ %bb.0: @ %entry
1136 ; CHECK-NEXT: .save {r7, lr}
1137 ; CHECK-NEXT: push {r7, lr}
1138 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1139 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1140 ; CHECK-NEXT: .pad #16
1141 ; CHECK-NEXT: sub sp, #16
1142 ; CHECK-NEXT: vmov q3, q0
1143 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
1144 ; CHECK-NEXT: vmov.i8 q0, #0x0
1145 ; CHECK-NEXT: vmov.i8 q2, #0xff
1146 ; CHECK-NEXT: vpsel q5, q2, q0
1147 ; CHECK-NEXT: vmov.s8 r2, q1[0]
1148 ; CHECK-NEXT: vmov.u8 r0, q5[0]
1149 ; CHECK-NEXT: vmov.s8 r3, q3[0]
1150 ; CHECK-NEXT: vmov.16 q4[0], r0
1151 ; CHECK-NEXT: vmov.u8 r0, q5[1]
1152 ; CHECK-NEXT: vmov.16 q4[1], r0
1153 ; CHECK-NEXT: vmov.u8 r0, q5[2]
1154 ; CHECK-NEXT: vmov.16 q4[2], r0
1155 ; CHECK-NEXT: vmov.u8 r0, q5[3]
1156 ; CHECK-NEXT: vmov.16 q4[3], r0
1157 ; CHECK-NEXT: vmov.u8 r0, q5[4]
1158 ; CHECK-NEXT: vmov.16 q4[4], r0
1159 ; CHECK-NEXT: vmov.u8 r0, q5[5]
1160 ; CHECK-NEXT: vmov.16 q4[5], r0
1161 ; CHECK-NEXT: vmov.u8 r0, q5[6]
1162 ; CHECK-NEXT: vmov.16 q4[6], r0
1163 ; CHECK-NEXT: vmov.u8 r0, q5[7]
1164 ; CHECK-NEXT: vmov.16 q4[7], r0
1165 ; CHECK-NEXT: smull r2, r3, r3, r2
1166 ; CHECK-NEXT: vcmp.i16 ne, q4, zr
1167 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
1168 ; CHECK-NEXT: vpsel q6, q2, q0
1169 ; CHECK-NEXT: vmov.u16 r0, q6[2]
1170 ; CHECK-NEXT: vmov.u16 r1, q6[0]
1171 ; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
1172 ; CHECK-NEXT: vmov.u16 r0, q6[3]
1173 ; CHECK-NEXT: vmov.u16 r1, q6[1]
1174 ; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
1175 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
1176 ; CHECK-NEXT: vpsel q7, q2, q0
1177 ; CHECK-NEXT: vmov r0, r1, d14
1178 ; CHECK-NEXT: vmov q4[2], q4[0], r0, r1
1179 ; CHECK-NEXT: vmov q4[3], q4[1], r0, r1
1180 ; CHECK-NEXT: vmov.s8 r0, q1[1]
1181 ; CHECK-NEXT: vmov.s8 r1, q3[1]
1182 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
1183 ; CHECK-NEXT: smull r0, r1, r1, r0
1184 ; CHECK-NEXT: vmov.i32 q4, #0x0
1185 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
1186 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
1187 ; CHECK-NEXT: vpsel q0, q0, q4
1188 ; CHECK-NEXT: vmov r0, r1, d1
1189 ; CHECK-NEXT: vmov r2, r3, d0
1190 ; CHECK-NEXT: adds.w r12, r2, r0
1191 ; CHECK-NEXT: vmov.s8 r0, q1[2]
1192 ; CHECK-NEXT: adc.w lr, r3, r1
1193 ; CHECK-NEXT: vmov r2, r3, d15
1194 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
1195 ; CHECK-NEXT: vmov.s8 r1, q3[2]
1196 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
1197 ; CHECK-NEXT: vmov.s8 r2, q1[3]
1198 ; CHECK-NEXT: vmov.s8 r3, q3[3]
1199 ; CHECK-NEXT: smull r0, r1, r1, r0
1200 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1201 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
1202 ; CHECK-NEXT: smull r2, r3, r3, r2
1203 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1204 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
1205 ; CHECK-NEXT: vpsel q0, q0, q4
1206 ; CHECK-NEXT: vmov r0, r1, d0
1207 ; CHECK-NEXT: vmov r2, r3, d1
1208 ; CHECK-NEXT: adds.w r0, r0, r12
1209 ; CHECK-NEXT: adc.w r1, r1, lr
1210 ; CHECK-NEXT: adds.w r12, r0, r2
1211 ; CHECK-NEXT: adc.w lr, r1, r3
1212 ; CHECK-NEXT: vmov.u16 r2, q6[6]
1213 ; CHECK-NEXT: vmov.u16 r3, q6[4]
1214 ; CHECK-NEXT: vmov.s8 r0, q1[4]
1215 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1216 ; CHECK-NEXT: vmov.u16 r2, q6[7]
1217 ; CHECK-NEXT: vmov.u16 r3, q6[5]
1218 ; CHECK-NEXT: vmov.s8 r1, q3[4]
1219 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1220 ; CHECK-NEXT: smull r0, r1, r1, r0
1221 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1222 ; CHECK-NEXT: vpsel q6, q2, q7
1223 ; CHECK-NEXT: vmov r2, r3, d12
1224 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
1225 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
1226 ; CHECK-NEXT: vmov.s8 r2, q1[5]
1227 ; CHECK-NEXT: vmov.s8 r3, q3[5]
1228 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1229 ; CHECK-NEXT: smull r2, r3, r3, r2
1230 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1231 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
1232 ; CHECK-NEXT: vpsel q0, q0, q4
1233 ; CHECK-NEXT: vmov r0, r1, d0
1234 ; CHECK-NEXT: vmov r2, r3, d1
1235 ; CHECK-NEXT: adds.w r0, r0, r12
1236 ; CHECK-NEXT: adc.w r1, r1, lr
1237 ; CHECK-NEXT: adds.w r12, r0, r2
1238 ; CHECK-NEXT: adc.w lr, r1, r3
1239 ; CHECK-NEXT: vmov r2, r3, d13
1240 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
1241 ; CHECK-NEXT: vmov.s8 r0, q1[6]
1242 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
1243 ; CHECK-NEXT: vmov.s8 r1, q3[6]
1244 ; CHECK-NEXT: vmov.s8 r2, q1[7]
1245 ; CHECK-NEXT: vmov.s8 r3, q3[7]
1246 ; CHECK-NEXT: smull r2, r3, r3, r2
1247 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1248 ; CHECK-NEXT: smull r0, r1, r1, r0
1249 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1250 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
1251 ; CHECK-NEXT: vpsel q0, q0, q4
1252 ; CHECK-NEXT: vmov r0, r1, d0
1253 ; CHECK-NEXT: vmov r2, r3, d1
1254 ; CHECK-NEXT: adds.w r0, r0, r12
1255 ; CHECK-NEXT: adc.w r1, r1, lr
1256 ; CHECK-NEXT: adds.w r12, r0, r2
1257 ; CHECK-NEXT: vmov.u8 r2, q5[8]
1258 ; CHECK-NEXT: adc.w lr, r1, r3
1259 ; CHECK-NEXT: vmov.16 q6[0], r2
1260 ; CHECK-NEXT: vmov.u8 r2, q5[9]
1261 ; CHECK-NEXT: vmov.16 q6[1], r2
1262 ; CHECK-NEXT: vmov.u8 r2, q5[10]
1263 ; CHECK-NEXT: vmov.16 q6[2], r2
1264 ; CHECK-NEXT: vmov.u8 r2, q5[11]
1265 ; CHECK-NEXT: vmov.16 q6[3], r2
1266 ; CHECK-NEXT: vmov.u8 r2, q5[12]
1267 ; CHECK-NEXT: vmov.16 q6[4], r2
1268 ; CHECK-NEXT: vmov.u8 r2, q5[13]
1269 ; CHECK-NEXT: vmov.16 q6[5], r2
1270 ; CHECK-NEXT: vmov.u8 r2, q5[14]
1271 ; CHECK-NEXT: vmov.16 q6[6], r2
1272 ; CHECK-NEXT: vmov.u8 r2, q5[15]
1273 ; CHECK-NEXT: vmov.16 q6[7], r2
1274 ; CHECK-NEXT: vmov.s8 r0, q1[8]
1275 ; CHECK-NEXT: vcmp.i16 ne, q6, zr
1276 ; CHECK-NEXT: vmov.s8 r1, q3[8]
1277 ; CHECK-NEXT: vpsel q5, q2, q7
1278 ; CHECK-NEXT: smull r0, r1, r1, r0
1279 ; CHECK-NEXT: vmov.u16 r2, q5[2]
1280 ; CHECK-NEXT: vmov.u16 r3, q5[0]
1281 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1282 ; CHECK-NEXT: vmov.u16 r2, q5[3]
1283 ; CHECK-NEXT: vmov.u16 r3, q5[1]
1284 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1285 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1286 ; CHECK-NEXT: vpsel q6, q2, q7
1287 ; CHECK-NEXT: vmov r2, r3, d12
1288 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
1289 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
1290 ; CHECK-NEXT: vmov.s8 r2, q1[9]
1291 ; CHECK-NEXT: vmov.s8 r3, q3[9]
1292 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1293 ; CHECK-NEXT: smull r2, r3, r3, r2
1294 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1295 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
1296 ; CHECK-NEXT: vpsel q0, q0, q4
1297 ; CHECK-NEXT: vmov r0, r1, d0
1298 ; CHECK-NEXT: vmov r2, r3, d1
1299 ; CHECK-NEXT: adds.w r0, r0, r12
1300 ; CHECK-NEXT: adc.w r1, r1, lr
1301 ; CHECK-NEXT: adds.w r12, r0, r2
1302 ; CHECK-NEXT: adc.w lr, r1, r3
1303 ; CHECK-NEXT: vmov r2, r3, d13
1304 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
1305 ; CHECK-NEXT: vmov.s8 r0, q1[10]
1306 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
1307 ; CHECK-NEXT: vmov.s8 r1, q3[10]
1308 ; CHECK-NEXT: vmov.s8 r2, q1[11]
1309 ; CHECK-NEXT: vmov.s8 r3, q3[11]
1310 ; CHECK-NEXT: smull r2, r3, r3, r2
1311 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1312 ; CHECK-NEXT: smull r0, r1, r1, r0
1313 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1314 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
1315 ; CHECK-NEXT: vpsel q0, q0, q4
1316 ; CHECK-NEXT: vmov r0, r1, d0
1317 ; CHECK-NEXT: vmov r2, r3, d1
1318 ; CHECK-NEXT: adds.w r0, r0, r12
1319 ; CHECK-NEXT: adc.w r1, r1, lr
1320 ; CHECK-NEXT: adds.w r12, r0, r2
1321 ; CHECK-NEXT: adc.w lr, r1, r3
1322 ; CHECK-NEXT: vmov.u16 r2, q5[6]
1323 ; CHECK-NEXT: vmov.u16 r3, q5[4]
1324 ; CHECK-NEXT: vmov.s8 r0, q1[12]
1325 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1326 ; CHECK-NEXT: vmov.u16 r2, q5[7]
1327 ; CHECK-NEXT: vmov.u16 r3, q5[5]
1328 ; CHECK-NEXT: vmov.s8 r1, q3[12]
1329 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1330 ; CHECK-NEXT: smull r0, r1, r1, r0
1331 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1332 ; CHECK-NEXT: vpsel q2, q2, q7
1333 ; CHECK-NEXT: vmov r2, r3, d4
1334 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
1335 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
1336 ; CHECK-NEXT: vmov.s8 r2, q1[13]
1337 ; CHECK-NEXT: vmov.s8 r3, q3[13]
1338 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1339 ; CHECK-NEXT: smull r2, r3, r3, r2
1340 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1341 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
1342 ; CHECK-NEXT: vpsel q0, q0, q4
1343 ; CHECK-NEXT: vmov r0, r1, d0
1344 ; CHECK-NEXT: vmov r2, r3, d1
1345 ; CHECK-NEXT: adds.w r0, r0, r12
1346 ; CHECK-NEXT: adc.w r1, r1, lr
1347 ; CHECK-NEXT: adds.w r12, r0, r2
1348 ; CHECK-NEXT: adc.w lr, r1, r3
1349 ; CHECK-NEXT: vmov r2, r3, d5
1350 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
1351 ; CHECK-NEXT: vmov.s8 r0, q1[14]
1352 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
1353 ; CHECK-NEXT: vmov.s8 r1, q3[14]
1354 ; CHECK-NEXT: vmov.s8 r2, q1[15]
1355 ; CHECK-NEXT: vmov.s8 r3, q3[15]
1356 ; CHECK-NEXT: smull r2, r3, r3, r2
1357 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1358 ; CHECK-NEXT: smull r0, r1, r1, r0
1359 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1360 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
1361 ; CHECK-NEXT: vpsel q0, q0, q4
1362 ; CHECK-NEXT: vmov r0, r1, d0
1363 ; CHECK-NEXT: vmov r2, r3, d1
1364 ; CHECK-NEXT: adds.w r0, r0, r12
1365 ; CHECK-NEXT: adc.w r1, r1, lr
1366 ; CHECK-NEXT: adds r0, r0, r2
1367 ; CHECK-NEXT: adcs r1, r3
1368 ; CHECK-NEXT: add sp, #16
1369 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1370 ; CHECK-NEXT: pop {r7, pc}
1371 entry:
1372 %c = icmp eq <16 x i8> %b, zeroinitializer
1373 %xx = sext <16 x i8> %x to <16 x i64>
1374 %yy = sext <16 x i8> %y to <16 x i64>
1375 %m = mul <16 x i64> %xx, %yy
1376 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
1377 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
1378 ret i64 %z
1379 }
1381 define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
1382 ; CHECK-LABEL: add_v8i8_v8i64_zext:
1383 ; CHECK: @ %bb.0: @ %entry
1384 ; CHECK-NEXT: vmovlb.u8 q2, q2
1385 ; CHECK-NEXT: vmovlb.u8 q1, q1
1386 ; CHECK-NEXT: vmovlb.u8 q0, q0
1387 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1388 ; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
1389 ; CHECK-NEXT: bx lr
1390 entry:
1391 %c = icmp eq <8 x i8> %b, zeroinitializer
1392 %xx = zext <8 x i8> %x to <8 x i64>
1393 %yy = zext <8 x i8> %y to <8 x i64>
1394 %m = mul <8 x i64> %xx, %yy
1395 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1396 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1397 ret i64 %z
1398 }
1400 define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
1401 ; CHECK-LABEL: add_v8i8_v8i64_sext:
1402 ; CHECK: @ %bb.0: @ %entry
1403 ; CHECK-NEXT: vmovlb.u8 q2, q2
1404 ; CHECK-NEXT: vmovlb.s8 q1, q1
1405 ; CHECK-NEXT: vmovlb.s8 q0, q0
1406 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1407 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
1408 ; CHECK-NEXT: bx lr
1409 entry:
1410 %c = icmp eq <8 x i8> %b, zeroinitializer
1411 %xx = sext <8 x i8> %x to <8 x i64>
1412 %yy = sext <8 x i8> %y to <8 x i64>
1413 %m = mul <8 x i64> %xx, %yy
1414 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1415 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1416 ret i64 %z
1417 }
1419 define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
1420 ; CHECK-LABEL: add_v4i8_v4i64_zext:
1421 ; CHECK: @ %bb.0: @ %entry
1422 ; CHECK-NEXT: vmov.i32 q3, #0xff
1423 ; CHECK-NEXT: vand q2, q2, q3
1424 ; CHECK-NEXT: vand q1, q1, q3
1425 ; CHECK-NEXT: vand q0, q0, q3
1426 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1427 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
1428 ; CHECK-NEXT: bx lr
1429 entry:
1430 %c = icmp eq <4 x i8> %b, zeroinitializer
1431 %xx = zext <4 x i8> %x to <4 x i64>
1432 %yy = zext <4 x i8> %y to <4 x i64>
1433 %m = mul <4 x i64> %xx, %yy
1434 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1435 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1436 ret i64 %z
1437 }
1439 define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
1440 ; CHECK-LABEL: add_v4i8_v4i64_sext:
1441 ; CHECK: @ %bb.0: @ %entry
1442 ; CHECK-NEXT: vmov.i32 q3, #0xff
1443 ; CHECK-NEXT: vmovlb.s8 q1, q1
1444 ; CHECK-NEXT: vmovlb.s8 q0, q0
1445 ; CHECK-NEXT: vand q2, q2, q3
1446 ; CHECK-NEXT: vmovlb.s16 q1, q1
1447 ; CHECK-NEXT: vmovlb.s16 q0, q0
1448 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1449 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
1450 ; CHECK-NEXT: bx lr
1451 entry:
1452 %c = icmp eq <4 x i8> %b, zeroinitializer
1453 %xx = sext <4 x i8> %x to <4 x i64>
1454 %yy = sext <4 x i8> %y to <4 x i64>
1455 %m = mul <4 x i64> %xx, %yy
1456 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1457 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1458 ret i64 %z
1459 }
1461 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_zext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1462 ; CHECK-LABEL: add_v4i8i16_v4i64_zext:
1463 ; CHECK: @ %bb.0: @ %entry
1464 ; CHECK-NEXT: vmov.i32 q3, #0xff
1465 ; CHECK-NEXT: vmovlb.u16 q1, q1
1466 ; CHECK-NEXT: vand q2, q2, q3
1467 ; CHECK-NEXT: vand q0, q0, q3
1468 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1469 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
1470 ; CHECK-NEXT: bx lr
1471 entry:
1472 %c = icmp eq <4 x i8> %b, zeroinitializer
1473 %xx = zext <4 x i8> %x to <4 x i64>
1474 %yy = zext <4 x i16> %y to <4 x i64>
1475 %m = mul <4 x i64> %xx, %yy
1476 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1477 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1478 ret i64 %z
1479 }
1481 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1482 ; CHECK-LABEL: add_v4i8i16_v4i64_sext:
1483 ; CHECK: @ %bb.0: @ %entry
1484 ; CHECK-NEXT: vmov.i32 q3, #0xff
1485 ; CHECK-NEXT: vmovlb.s8 q0, q0
1486 ; CHECK-NEXT: vand q2, q2, q3
1487 ; CHECK-NEXT: vmovlb.s16 q1, q1
1488 ; CHECK-NEXT: vmovlb.s16 q0, q0
1489 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1490 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
1491 ; CHECK-NEXT: bx lr
1492 entry:
1493 %c = icmp eq <4 x i8> %b, zeroinitializer
1494 %xx = sext <4 x i8> %x to <4 x i64>
1495 %yy = sext <4 x i16> %y to <4 x i64>
1496 %m = mul <4 x i64> %xx, %yy
1497 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1498 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1499 ret i64 %z
1500 }
1502 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_zext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1503 ; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_zext:
1504 ; CHECK: @ %bb.0: @ %entry
1505 ; CHECK-NEXT: vmov.i32 q3, #0xff
1506 ; CHECK-NEXT: vmovlb.u16 q1, q1
1507 ; CHECK-NEXT: vand q2, q2, q3
1508 ; CHECK-NEXT: vand q0, q0, q3
1509 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1510 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
1511 ; CHECK-NEXT: bx lr
1512 entry:
1513 %c = icmp eq <4 x i8> %b, zeroinitializer
1514 %xx = zext <4 x i8> %x to <4 x i32>
1515 %yy = zext <4 x i16> %y to <4 x i32>
1516 %mm = mul <4 x i32> %xx, %yy
1517 %m = zext <4 x i32> %mm to <4 x i64>
1518 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1519 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1520 ret i64 %z
1521 }
1523 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1524 ; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext:
1525 ; CHECK: @ %bb.0: @ %entry
1526 ; CHECK-NEXT: vmov.i32 q3, #0xff
1527 ; CHECK-NEXT: vmovlb.s8 q0, q0
1528 ; CHECK-NEXT: vand q2, q2, q3
1529 ; CHECK-NEXT: vmovlb.s16 q1, q1
1530 ; CHECK-NEXT: vmovlb.s16 q0, q0
1531 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1532 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
1533 ; CHECK-NEXT: bx lr
1534 entry:
1535 %c = icmp eq <4 x i8> %b, zeroinitializer
1536 %xx = sext <4 x i8> %x to <4 x i32>
1537 %yy = sext <4 x i16> %y to <4 x i32>
1538 %mm = mul <4 x i32> %xx, %yy
1539 %m = sext <4 x i32> %mm to <4 x i64>
1540 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1541 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1542 ret i64 %z
1543 }
1545 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
1546 ; CHECK-LABEL: add_v2i8_v2i64_zext:
1547 ; CHECK: @ %bb.0: @ %entry
1548 ; CHECK-NEXT: vmov.i64 q3, #0xff
1549 ; CHECK-NEXT: vand q1, q1, q3
1550 ; CHECK-NEXT: vand q0, q0, q3
1551 ; CHECK-NEXT: vmov r0, s6
1552 ; CHECK-NEXT: vmov r1, s2
1553 ; CHECK-NEXT: vmov r2, s4
1554 ; CHECK-NEXT: vand q1, q2, q3
1555 ; CHECK-NEXT: vmov r3, s0
1556 ; CHECK-NEXT: umull r0, r1, r1, r0
1557 ; CHECK-NEXT: umull r2, r3, r3, r2
1558 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
1559 ; CHECK-NEXT: vmov r0, s4
1560 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
1561 ; CHECK-NEXT: movs r1, #0
1562 ; CHECK-NEXT: cmp r0, #0
1563 ; CHECK-NEXT: csetm r0, eq
1564 ; CHECK-NEXT: bfi r1, r0, #0, #8
1565 ; CHECK-NEXT: vmov r0, s6
1566 ; CHECK-NEXT: vmov.i32 q1, #0x0
1567 ; CHECK-NEXT: cmp r0, #0
1568 ; CHECK-NEXT: csetm r0, eq
1569 ; CHECK-NEXT: bfi r1, r0, #8, #8
1570 ; CHECK-NEXT: vmsr p0, r1
1571 ; CHECK-NEXT: vpsel q0, q0, q1
1572 ; CHECK-NEXT: vmov r0, r1, d1
1573 ; CHECK-NEXT: vmov r2, r3, d0
1574 ; CHECK-NEXT: adds r0, r0, r2
1575 ; CHECK-NEXT: adcs r1, r3
1576 ; CHECK-NEXT: bx lr
1577 entry:
1578 %c = icmp eq <2 x i8> %b, zeroinitializer
1579 %xx = zext <2 x i8> %x to <2 x i64>
1580 %yy = zext <2 x i8> %y to <2 x i64>
1581 %m = mul <2 x i64> %xx, %yy
1582 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1583 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1584 ret i64 %z
1585 }
1587 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
1588 ; CHECK-LABEL: add_v2i8_v2i64_sext:
1589 ; CHECK: @ %bb.0: @ %entry
1590 ; CHECK-NEXT: vmov.i32 q3, #0xff
1591 ; CHECK-NEXT: movs r1, #0
1592 ; CHECK-NEXT: vand q2, q2, q3
1593 ; CHECK-NEXT: vmov r2, s4
1594 ; CHECK-NEXT: vmov r0, s8
1595 ; CHECK-NEXT: vmov r3, s0
1596 ; CHECK-NEXT: cmp r0, #0
1597 ; CHECK-NEXT: sxtb r2, r2
1598 ; CHECK-NEXT: csetm r0, eq
1599 ; CHECK-NEXT: bfi r1, r0, #0, #8
1600 ; CHECK-NEXT: vmov r0, s10
1601 ; CHECK-NEXT: sxtb r3, r3
1602 ; CHECK-NEXT: smull r2, r3, r3, r2
1603 ; CHECK-NEXT: cmp r0, #0
1604 ; CHECK-NEXT: csetm r0, eq
1605 ; CHECK-NEXT: bfi r1, r0, #8, #8
1606 ; CHECK-NEXT: vmov r0, s6
1607 ; CHECK-NEXT: vmsr p0, r1
1608 ; CHECK-NEXT: vmov r1, s2
1609 ; CHECK-NEXT: vmov.i32 q1, #0x0
1610 ; CHECK-NEXT: sxtb r0, r0
1611 ; CHECK-NEXT: sxtb r1, r1
1612 ; CHECK-NEXT: smull r0, r1, r1, r0
1613 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
1614 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
1615 ; CHECK-NEXT: vpsel q0, q0, q1
1616 ; CHECK-NEXT: vmov r0, r1, d1
1617 ; CHECK-NEXT: vmov r2, r3, d0
1618 ; CHECK-NEXT: adds r0, r0, r2
1619 ; CHECK-NEXT: adcs r1, r3
1620 ; CHECK-NEXT: bx lr
1621 entry:
1622 %c = icmp eq <2 x i8> %b, zeroinitializer
1623 %xx = sext <2 x i8> %x to <2 x i64>
1624 %yy = sext <2 x i8> %y to <2 x i64>
1625 %m = mul <2 x i64> %xx, %yy
1626 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1627 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1628 ret i64 %z
1629 }
1631 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) {
1632 ; CHECK-LABEL: add_v2i64_v2i64:
1633 ; CHECK: @ %bb.0: @ %entry
1634 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
1635 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
1636 ; CHECK-NEXT: vmov r0, r12, d3
1637 ; CHECK-NEXT: vmov r2, lr, d1
1638 ; CHECK-NEXT: vmov r4, r9, d2
1639 ; CHECK-NEXT: vmov.i32 q1, #0x0
1640 ; CHECK-NEXT: vmov r6, r7, d0
1641 ; CHECK-NEXT: umull r1, r8, r2, r0
1642 ; CHECK-NEXT: umull r3, r5, r6, r4
1643 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r1
1644 ; CHECK-NEXT: mla r1, r2, r12, r8
1645 ; CHECK-NEXT: mla r0, lr, r0, r1
1646 ; CHECK-NEXT: mla r1, r6, r9, r5
1647 ; CHECK-NEXT: mla r1, r7, r4, r1
1648 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
1649 ; CHECK-NEXT: vmov r0, r1, d4
1650 ; CHECK-NEXT: orrs r0, r1
1651 ; CHECK-NEXT: mov.w r1, #0
1652 ; CHECK-NEXT: csetm r0, eq
1653 ; CHECK-NEXT: bfi r1, r0, #0, #8
1654 ; CHECK-NEXT: vmov r0, r2, d5
1655 ; CHECK-NEXT: orrs r0, r2
1656 ; CHECK-NEXT: csetm r0, eq
1657 ; CHECK-NEXT: bfi r1, r0, #8, #8
1658 ; CHECK-NEXT: vmsr p0, r1
1659 ; CHECK-NEXT: vpsel q0, q0, q1
1660 ; CHECK-NEXT: vmov r0, r1, d1
1661 ; CHECK-NEXT: vmov r2, r3, d0
1662 ; CHECK-NEXT: adds r0, r0, r2
1663 ; CHECK-NEXT: adcs r1, r3
1664 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
1665 entry:
1666 %c = icmp eq <2 x i64> %b, zeroinitializer
1667 %m = mul <2 x i64> %x, %y
1668 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1669 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1670 ret i64 %z
1671 }
1673 define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i32 %a) {
1674 ; CHECK-LABEL: add_v4i32_v4i32_acc:
1675 ; CHECK: @ %bb.0: @ %entry
1676 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1677 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
1678 ; CHECK-NEXT: bx lr
1679 entry:
1680 %c = icmp eq <4 x i32> %b, zeroinitializer
1681 %m = mul <4 x i32> %x, %y
1682 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1683 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1684 %r = add i32 %z, %a
1685 ret i32 %r
1686 }
1688 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
1689 ; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
1690 ; CHECK: @ %bb.0: @ %entry
1691 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1692 ; CHECK-NEXT: vmlalvat.u32 r0, r1, q0, q1
1693 ; CHECK-NEXT: bx lr
1694 entry:
1695 %c = icmp eq <4 x i32> %b, zeroinitializer
1696 %xx = zext <4 x i32> %x to <4 x i64>
1697 %yy = zext <4 x i32> %y to <4 x i64>
1698 %m = mul <4 x i64> %xx, %yy
1699 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1700 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1701 %r = add i64 %z, %a
1702 ret i64 %r
1703 }
1705 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
1706 ; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
1707 ; CHECK: @ %bb.0: @ %entry
1708 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1709 ; CHECK-NEXT: vmlalvat.s32 r0, r1, q0, q1
1710 ; CHECK-NEXT: bx lr
1711 entry:
1712 %c = icmp eq <4 x i32> %b, zeroinitializer
1713 %xx = sext <4 x i32> %x to <4 x i64>
1714 %yy = sext <4 x i32> %y to <4 x i64>
1715 %m = mul <4 x i64> %xx, %yy
1716 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1717 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1718 %r = add i64 %z, %a
1719 ret i64 %r
1720 }
1722 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
1723 ; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
1724 ; CHECK: @ %bb.0: @ %entry
1725 ; CHECK-NEXT: .save {r7, lr}
1726 ; CHECK-NEXT: push {r7, lr}
1727 ; CHECK-NEXT: vmov r2, s8
1728 ; CHECK-NEXT: movs r3, #0
1729 ; CHECK-NEXT: vmullb.u32 q3, q0, q1
1730 ; CHECK-NEXT: vmov.i32 q0, #0x0
1731 ; CHECK-NEXT: cmp r2, #0
1732 ; CHECK-NEXT: csetm r2, eq
1733 ; CHECK-NEXT: bfi r3, r2, #0, #8
1734 ; CHECK-NEXT: vmov r2, s10
1735 ; CHECK-NEXT: cmp r2, #0
1736 ; CHECK-NEXT: csetm r2, eq
1737 ; CHECK-NEXT: bfi r3, r2, #8, #8
1738 ; CHECK-NEXT: vmsr p0, r3
1739 ; CHECK-NEXT: vpsel q0, q3, q0
1740 ; CHECK-NEXT: vmov lr, r12, d1
1741 ; CHECK-NEXT: vmov r3, r2, d0
1742 ; CHECK-NEXT: adds.w r3, r3, lr
1743 ; CHECK-NEXT: adc.w r2, r2, r12
1744 ; CHECK-NEXT: adds r0, r0, r3
1745 ; CHECK-NEXT: adcs r1, r2
1746 ; CHECK-NEXT: pop {r7, pc}
1747 entry:
1748 %c = icmp eq <2 x i32> %b, zeroinitializer
1749 %xx = zext <2 x i32> %x to <2 x i64>
1750 %yy = zext <2 x i32> %y to <2 x i64>
1751 %m = mul <2 x i64> %xx, %yy
1752 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1753 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1754 %r = add i64 %z, %a
1755 ret i64 %r
1756 }
1758 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
1759 ; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
1760 ; CHECK: @ %bb.0: @ %entry
1761 ; CHECK-NEXT: .save {r7, lr}
1762 ; CHECK-NEXT: push {r7, lr}
1763 ; CHECK-NEXT: vmov r2, s8
1764 ; CHECK-NEXT: movs r3, #0
1765 ; CHECK-NEXT: vmullb.s32 q3, q0, q1
1766 ; CHECK-NEXT: vmov.i32 q0, #0x0
1767 ; CHECK-NEXT: cmp r2, #0
1768 ; CHECK-NEXT: csetm r2, eq
1769 ; CHECK-NEXT: bfi r3, r2, #0, #8
1770 ; CHECK-NEXT: vmov r2, s10
1771 ; CHECK-NEXT: cmp r2, #0
1772 ; CHECK-NEXT: csetm r2, eq
1773 ; CHECK-NEXT: bfi r3, r2, #8, #8
1774 ; CHECK-NEXT: vmsr p0, r3
1775 ; CHECK-NEXT: vpsel q0, q3, q0
1776 ; CHECK-NEXT: vmov lr, r12, d1
1777 ; CHECK-NEXT: vmov r3, r2, d0
1778 ; CHECK-NEXT: adds.w r3, r3, lr
1779 ; CHECK-NEXT: adc.w r2, r2, r12
1780 ; CHECK-NEXT: adds r0, r0, r3
1781 ; CHECK-NEXT: adcs r1, r2
1782 ; CHECK-NEXT: pop {r7, pc}
1783 entry:
1784 %c = icmp eq <2 x i32> %b, zeroinitializer
1785 %xx = sext <2 x i32> %x to <2 x i64>
1786 %yy = sext <2 x i32> %y to <2 x i64>
1787 %m = mul <2 x i64> %xx, %yy
1788 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1789 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1790 %r = add i64 %z, %a
1791 ret i64 %r
1792 }
1794 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
1795 ; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
1796 ; CHECK: @ %bb.0: @ %entry
1797 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1798 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
1799 ; CHECK-NEXT: bx lr
1800 entry:
1801 %c = icmp eq <8 x i16> %b, zeroinitializer
1802 %xx = zext <8 x i16> %x to <8 x i32>
1803 %yy = zext <8 x i16> %y to <8 x i32>
1804 %m = mul <8 x i32> %xx, %yy
1805 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
1806 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
1807 %r = add i32 %z, %a
1808 ret i32 %r
1809 }
1811 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
1812 ; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
1813 ; CHECK: @ %bb.0: @ %entry
1814 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1815 ; CHECK-NEXT: vmlavat.s16 r0, q0, q1
1816 ; CHECK-NEXT: bx lr
1817 entry:
1818 %c = icmp eq <8 x i16> %b, zeroinitializer
1819 %xx = sext <8 x i16> %x to <8 x i32>
1820 %yy = sext <8 x i16> %y to <8 x i32>
1821 %m = mul <8 x i32> %xx, %yy
1822 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
1823 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
1824 %r = add i32 %z, %a
1825 ret i32 %r
1826 }
1828 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
1829 ; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
1830 ; CHECK: @ %bb.0: @ %entry
1831 ; CHECK-NEXT: vmovlb.u16 q2, q2
1832 ; CHECK-NEXT: vmovlb.u16 q1, q1
1833 ; CHECK-NEXT: vmovlb.u16 q0, q0
1834 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1835 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
1836 ; CHECK-NEXT: bx lr
1837 entry:
1838 %c = icmp eq <4 x i16> %b, zeroinitializer
1839 %xx = zext <4 x i16> %x to <4 x i32>
1840 %yy = zext <4 x i16> %y to <4 x i32>
1841 %m = mul <4 x i32> %xx, %yy
1842 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1843 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1844 %r = add i32 %z, %a
1845 ret i32 %r
1846 }
1848 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
1849 ; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
1850 ; CHECK: @ %bb.0: @ %entry
1851 ; CHECK-NEXT: vmovlb.u16 q2, q2
1852 ; CHECK-NEXT: vmovlb.s16 q1, q1
1853 ; CHECK-NEXT: vmovlb.s16 q0, q0
1854 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1855 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
1856 ; CHECK-NEXT: bx lr
1857 entry:
1858 %c = icmp eq <4 x i16> %b, zeroinitializer
1859 %xx = sext <4 x i16> %x to <4 x i32>
1860 %yy = sext <4 x i16> %y to <4 x i32>
1861 %m = mul <4 x i32> %xx, %yy
1862 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1863 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1864 %r = add i32 %z, %a
1865 ret i32 %r
1866 }
1868 define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i16 %a) {
1869 ; CHECK-LABEL: add_v8i16_v8i16_acc:
1870 ; CHECK: @ %bb.0: @ %entry
1871 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1872 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
1873 ; CHECK-NEXT: uxth r0, r0
1874 ; CHECK-NEXT: bx lr
1875 entry:
1876 %c = icmp eq <8 x i16> %b, zeroinitializer
1877 %m = mul <8 x i16> %x, %y
1878 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
1879 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
1880 %r = add i16 %z, %a
1881 ret i16 %r
1882 }
1884 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1885 ; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
1886 ; CHECK: @ %bb.0: @ %entry
1887 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1888 ; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1
1889 ; CHECK-NEXT: bx lr
1890 entry:
1891 %c = icmp eq <8 x i16> %b, zeroinitializer
1892 %xx = zext <8 x i16> %x to <8 x i64>
1893 %yy = zext <8 x i16> %y to <8 x i64>
1894 %m = mul <8 x i64> %xx, %yy
1895 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1896 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1897 %r = add i64 %z, %a
1898 ret i64 %r
1899 }
1901 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1902 ; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
1903 ; CHECK: @ %bb.0: @ %entry
1904 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1905 ; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1
1906 ; CHECK-NEXT: bx lr
1907 entry:
1908 %c = icmp eq <8 x i16> %b, zeroinitializer
1909 %xx = sext <8 x i16> %x to <8 x i64>
1910 %yy = sext <8 x i16> %y to <8 x i64>
1911 %m = mul <8 x i64> %xx, %yy
1912 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1913 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1914 %r = add i64 %z, %a
1915 ret i64 %r
1916 }
1918 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1919 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext:
1920 ; CHECK: @ %bb.0: @ %entry
1921 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1922 ; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1
1923 ; CHECK-NEXT: bx lr
1924 entry:
1925 %c = icmp eq <8 x i16> %b, zeroinitializer
1926 %xx = zext <8 x i16> %x to <8 x i32>
1927 %yy = zext <8 x i16> %y to <8 x i32>
1928 %m = mul <8 x i32> %xx, %yy
1929 %ma = zext <8 x i32> %m to <8 x i64>
1930 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1931 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1932 %r = add i64 %z, %a
1933 ret i64 %r
1934 }
1936 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1937 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext:
1938 ; CHECK: @ %bb.0: @ %entry
1939 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1940 ; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1
1941 ; CHECK-NEXT: bx lr
1942 entry:
1943 %c = icmp eq <8 x i16> %b, zeroinitializer
1944 %xx = sext <8 x i16> %x to <8 x i32>
1945 %yy = sext <8 x i16> %y to <8 x i32>
1946 %m = mul <8 x i32> %xx, %yy
1947 %ma = sext <8 x i32> %m to <8 x i64>
1948 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1949 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1950 %r = add i64 %z, %a
1951 ret i64 %r
1952 }
1954 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1955 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext:
1956 ; CHECK: @ %bb.0: @ %entry
1957 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1958 ; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q0
1959 ; CHECK-NEXT: bx lr
1960 entry:
1961 %c = icmp eq <8 x i16> %b, zeroinitializer
1962 %xx = sext <8 x i16> %x to <8 x i32>
1963 %m = mul <8 x i32> %xx, %xx
1964 %ma = zext <8 x i32> %m to <8 x i64>
1965 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1966 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1967 %r = add i64 %z, %a
1968 ret i64 %r
1969 }
1971 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
1972 ; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
1973 ; CHECK: @ %bb.0: @ %entry
1974 ; CHECK-NEXT: .save {r7, lr}
1975 ; CHECK-NEXT: push {r7, lr}
1976 ; CHECK-NEXT: vmov.i64 q3, #0xffff
1977 ; CHECK-NEXT: vand q1, q1, q3
1978 ; CHECK-NEXT: vand q0, q0, q3
1979 ; CHECK-NEXT: vmov r2, s6
1980 ; CHECK-NEXT: vmov r3, s2
1981 ; CHECK-NEXT: umull lr, r12, r3, r2
1982 ; CHECK-NEXT: vmov r3, s4
1983 ; CHECK-NEXT: vmov r2, s0
1984 ; CHECK-NEXT: vand q1, q2, q3
1985 ; CHECK-NEXT: umull r2, r3, r2, r3
1986 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
1987 ; CHECK-NEXT: vmov r2, s4
1988 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
1989 ; CHECK-NEXT: movs r3, #0
1990 ; CHECK-NEXT: cmp r2, #0
1991 ; CHECK-NEXT: csetm r2, eq
1992 ; CHECK-NEXT: bfi r3, r2, #0, #8
1993 ; CHECK-NEXT: vmov r2, s6
1994 ; CHECK-NEXT: vmov.i32 q1, #0x0
1995 ; CHECK-NEXT: cmp r2, #0
1996 ; CHECK-NEXT: csetm r2, eq
1997 ; CHECK-NEXT: bfi r3, r2, #8, #8
1998 ; CHECK-NEXT: vmsr p0, r3
1999 ; CHECK-NEXT: vpsel q0, q0, q1
2000 ; CHECK-NEXT: vmov lr, r12, d1
2001 ; CHECK-NEXT: vmov r3, r2, d0
2002 ; CHECK-NEXT: adds.w r3, r3, lr
2003 ; CHECK-NEXT: adc.w r2, r2, r12
2004 ; CHECK-NEXT: adds r0, r0, r3
2005 ; CHECK-NEXT: adcs r1, r2
2006 ; CHECK-NEXT: pop {r7, pc}
2007 entry:
2008 %c = icmp eq <2 x i16> %b, zeroinitializer
2009 %xx = zext <2 x i16> %x to <2 x i64>
2010 %yy = zext <2 x i16> %y to <2 x i64>
2011 %m = mul <2 x i64> %xx, %yy
2012 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2013 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
2014 %r = add i64 %z, %a
2015 ret i64 %r
2016 }
2018 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
2019 ; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
2020 ; CHECK: @ %bb.0: @ %entry
2021 ; CHECK-NEXT: .save {r7, lr}
2022 ; CHECK-NEXT: push {r7, lr}
2023 ; CHECK-NEXT: vmov.i32 q3, #0xffff
2024 ; CHECK-NEXT: movs r3, #0
2025 ; CHECK-NEXT: vand q2, q2, q3
2026 ; CHECK-NEXT: vmov r2, s8
2027 ; CHECK-NEXT: cmp r2, #0
2028 ; CHECK-NEXT: csetm r2, eq
2029 ; CHECK-NEXT: bfi r3, r2, #0, #8
2030 ; CHECK-NEXT: vmov r2, s10
2031 ; CHECK-NEXT: cmp r2, #0
2032 ; CHECK-NEXT: csetm r2, eq
2033 ; CHECK-NEXT: bfi r3, r2, #8, #8
2034 ; CHECK-NEXT: vmov r2, s6
2035 ; CHECK-NEXT: vmsr p0, r3
2036 ; CHECK-NEXT: vmov r3, s2
2037 ; CHECK-NEXT: sxth r2, r2
2038 ; CHECK-NEXT: sxth r3, r3
2039 ; CHECK-NEXT: smull lr, r12, r3, r2
2040 ; CHECK-NEXT: vmov r3, s4
2041 ; CHECK-NEXT: vmov r2, s0
2042 ; CHECK-NEXT: vmov.i32 q1, #0x0
2043 ; CHECK-NEXT: sxth r3, r3
2044 ; CHECK-NEXT: sxth r2, r2
2045 ; CHECK-NEXT: smull r2, r3, r2, r3
2046 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
2047 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
2048 ; CHECK-NEXT: vpsel q0, q0, q1
2049 ; CHECK-NEXT: vmov lr, r12, d1
2050 ; CHECK-NEXT: vmov r3, r2, d0
2051 ; CHECK-NEXT: adds.w r3, r3, lr
2052 ; CHECK-NEXT: adc.w r2, r2, r12
2053 ; CHECK-NEXT: adds r0, r0, r3
2054 ; CHECK-NEXT: adcs r1, r2
2055 ; CHECK-NEXT: pop {r7, pc}
2056 entry:
2057 %c = icmp eq <2 x i16> %b, zeroinitializer
2058 %xx = sext <2 x i16> %x to <2 x i64>
2059 %yy = sext <2 x i16> %y to <2 x i64>
2060 %m = mul <2 x i64> %xx, %yy
2061 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2062 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
2063 %r = add i64 %z, %a
2064 ret i64 %r
2065 }
2067 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2068 ; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
2069 ; CHECK: @ %bb.0: @ %entry
2070 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2071 ; CHECK-NEXT: vmlavat.u8 r0, q0, q1
2072 ; CHECK-NEXT: bx lr
2073 entry:
2074 %c = icmp eq <16 x i8> %b, zeroinitializer
2075 %xx = zext <16 x i8> %x to <16 x i32>
2076 %yy = zext <16 x i8> %y to <16 x i32>
2077 %m = mul <16 x i32> %xx, %yy
2078 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
2079 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2080 %r = add i32 %z, %a
2081 ret i32 %r
2082 }
2084 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2085 ; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
2086 ; CHECK: @ %bb.0: @ %entry
2087 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2088 ; CHECK-NEXT: vmlavat.s8 r0, q0, q1
2089 ; CHECK-NEXT: bx lr
2090 entry:
2091 %c = icmp eq <16 x i8> %b, zeroinitializer
2092 %xx = sext <16 x i8> %x to <16 x i32>
2093 %yy = sext <16 x i8> %y to <16 x i32>
2094 %m = mul <16 x i32> %xx, %yy
2095 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
2096 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2097 %r = add i32 %z, %a
2098 ret i32 %r
2099 }
2101 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2102 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext:
2103 ; CHECK: @ %bb.0: @ %entry
2104 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2105 ; CHECK-NEXT: vmlavat.u8 r0, q0, q1
2106 ; CHECK-NEXT: bx lr
2107 entry:
2108 %c = icmp eq <16 x i8> %b, zeroinitializer
2109 %xx = zext <16 x i8> %x to <16 x i16>
2110 %yy = zext <16 x i8> %y to <16 x i16>
2111 %m = mul <16 x i16> %xx, %yy
2112 %ma = zext <16 x i16> %m to <16 x i32>
2113 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2114 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2115 %r = add i32 %z, %a
2116 ret i32 %r
2117 }
2119 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2120 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext:
2121 ; CHECK: @ %bb.0: @ %entry
2122 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2123 ; CHECK-NEXT: vmlavat.s8 r0, q0, q1
2124 ; CHECK-NEXT: bx lr
2125 entry:
2126 %c = icmp eq <16 x i8> %b, zeroinitializer
2127 %xx = sext <16 x i8> %x to <16 x i16>
2128 %yy = sext <16 x i8> %y to <16 x i16>
2129 %m = mul <16 x i16> %xx, %yy
2130 %ma = sext <16 x i16> %m to <16 x i32>
2131 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2132 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2133 %r = add i32 %z, %a
2134 ret i32 %r
2135 }
2137 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2138 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext:
2139 ; CHECK: @ %bb.0: @ %entry
2140 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2141 ; CHECK-NEXT: vmlavat.s8 r0, q0, q0
2142 ; CHECK-NEXT: bx lr
2143 entry:
2144 %c = icmp eq <16 x i8> %b, zeroinitializer
2145 %xx = sext <16 x i8> %x to <16 x i16>
2146 %m = mul <16 x i16> %xx, %xx
2147 %ma = zext <16 x i16> %m to <16 x i32>
2148 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2149 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2150 %r = add i32 %z, %a
2151 ret i32 %r
2152 }
2154 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
2155 ; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
2156 ; CHECK: @ %bb.0: @ %entry
2157 ; CHECK-NEXT: vmov.i32 q3, #0xff
2158 ; CHECK-NEXT: vand q2, q2, q3
2159 ; CHECK-NEXT: vand q1, q1, q3
2160 ; CHECK-NEXT: vand q0, q0, q3
2161 ; CHECK-NEXT: vpt.i32 eq, q2, zr
2162 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
2163 ; CHECK-NEXT: bx lr
2164 entry:
2165 %c = icmp eq <4 x i8> %b, zeroinitializer
2166 %xx = zext <4 x i8> %x to <4 x i32>
2167 %yy = zext <4 x i8> %y to <4 x i32>
2168 %m = mul <4 x i32> %xx, %yy
2169 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
2170 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
2171 %r = add i32 %z, %a
2172 ret i32 %r
2173 }
2175 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
2176 ; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
2177 ; CHECK: @ %bb.0: @ %entry
2178 ; CHECK-NEXT: vmov.i32 q3, #0xff
2179 ; CHECK-NEXT: vmovlb.s8 q1, q1
2180 ; CHECK-NEXT: vmovlb.s8 q0, q0
2181 ; CHECK-NEXT: vand q2, q2, q3
2182 ; CHECK-NEXT: vmovlb.s16 q1, q1
2183 ; CHECK-NEXT: vmovlb.s16 q0, q0
2184 ; CHECK-NEXT: vpt.i32 eq, q2, zr
2185 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
2186 ; CHECK-NEXT: bx lr
2187 entry:
2188 %c = icmp eq <4 x i8> %b, zeroinitializer
2189 %xx = sext <4 x i8> %x to <4 x i32>
2190 %yy = sext <4 x i8> %y to <4 x i32>
2191 %m = mul <4 x i32> %xx, %yy
2192 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
2193 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
2194 %r = add i32 %z, %a
2195 ret i32 %r
2196 }
2198 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
2199 ; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
2200 ; CHECK: @ %bb.0: @ %entry
2201 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2202 ; CHECK-NEXT: vmlavat.u8 r0, q0, q1
2203 ; CHECK-NEXT: uxth r0, r0
2204 ; CHECK-NEXT: bx lr
2205 entry:
2206 %c = icmp eq <16 x i8> %b, zeroinitializer
2207 %xx = zext <16 x i8> %x to <16 x i16>
2208 %yy = zext <16 x i8> %y to <16 x i16>
2209 %m = mul <16 x i16> %xx, %yy
2210 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
2211 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
2212 %r = add i16 %z, %a
2213 ret i16 %r
2214 }
2216 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
2217 ; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
2218 ; CHECK: @ %bb.0: @ %entry
2219 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2220 ; CHECK-NEXT: vmlavat.s8 r0, q0, q1
2221 ; CHECK-NEXT: sxth r0, r0
2222 ; CHECK-NEXT: bx lr
2223 entry:
2224 %c = icmp eq <16 x i8> %b, zeroinitializer
2225 %xx = sext <16 x i8> %x to <16 x i16>
2226 %yy = sext <16 x i8> %y to <16 x i16>
2227 %m = mul <16 x i16> %xx, %yy
2228 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
2229 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
2230 %r = add i16 %z, %a
2231 ret i16 %r
2232 }
2234 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
2235 ; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
2236 ; CHECK: @ %bb.0: @ %entry
2237 ; CHECK-NEXT: vmovlb.u8 q2, q2
2238 ; CHECK-NEXT: vmovlb.u8 q1, q1
2239 ; CHECK-NEXT: vmovlb.u8 q0, q0
2240 ; CHECK-NEXT: vpt.i16 eq, q2, zr
2241 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
2242 ; CHECK-NEXT: uxth r0, r0
2243 ; CHECK-NEXT: bx lr
2244 entry:
2245 %c = icmp eq <8 x i8> %b, zeroinitializer
2246 %xx = zext <8 x i8> %x to <8 x i16>
2247 %yy = zext <8 x i8> %y to <8 x i16>
2248 %m = mul <8 x i16> %xx, %yy
2249 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
2250 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
2251 %r = add i16 %z, %a
2252 ret i16 %r
2253 }
2255 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
2256 ; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
2257 ; CHECK: @ %bb.0: @ %entry
2258 ; CHECK-NEXT: vmovlb.u8 q2, q2
2259 ; CHECK-NEXT: vmovlb.s8 q1, q1
2260 ; CHECK-NEXT: vmovlb.s8 q0, q0
2261 ; CHECK-NEXT: vpt.i16 eq, q2, zr
2262 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
2263 ; CHECK-NEXT: sxth r0, r0
2264 ; CHECK-NEXT: bx lr
2265 entry:
2266 %c = icmp eq <8 x i8> %b, zeroinitializer
2267 %xx = sext <8 x i8> %x to <8 x i16>
2268 %yy = sext <8 x i8> %y to <8 x i16>
2269 %m = mul <8 x i16> %xx, %yy
2270 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
2271 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
2272 %r = add i16 %z, %a
2273 ret i16 %r
2274 }
2276 define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i8 %a) {
2277 ; CHECK-LABEL: add_v16i8_v16i8_acc:
2278 ; CHECK: @ %bb.0: @ %entry
2279 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2280 ; CHECK-NEXT: vmlavat.u8 r0, q0, q1
2281 ; CHECK-NEXT: uxtb r0, r0
2282 ; CHECK-NEXT: bx lr
2283 entry:
2284 %c = icmp eq <16 x i8> %b, zeroinitializer
2285 %m = mul <16 x i8> %x, %y
2286 %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
2287 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
2288 %r = add i8 %z, %a
2289 ret i8 %r
2290 }
2292 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
2293 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
2294 ; CHECK: @ %bb.0: @ %entry
2295 ; CHECK-NEXT: .save {r4, r5, r7, lr}
2296 ; CHECK-NEXT: push {r4, r5, r7, lr}
2297 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2298 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2299 ; CHECK-NEXT: .pad #32
2300 ; CHECK-NEXT: sub sp, #32
2301 ; CHECK-NEXT: vmov q3, q0
2302 ; CHECK-NEXT: vmov.i8 q0, #0x0
2303 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
2304 ; CHECK-NEXT: vmov.i8 q2, #0xff
2305 ; CHECK-NEXT: vpsel q6, q2, q0
2306 ; CHECK-NEXT: vmov q4, q0
2307 ; CHECK-NEXT: vmov.u8 r2, q6[0]
2308 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
2309 ; CHECK-NEXT: vmov.16 q0[0], r2
2310 ; CHECK-NEXT: vmov.u8 r2, q6[1]
2311 ; CHECK-NEXT: vmov.16 q0[1], r2
2312 ; CHECK-NEXT: vmov.u8 r2, q6[2]
2313 ; CHECK-NEXT: vmov.16 q0[2], r2
2314 ; CHECK-NEXT: vmov.u8 r2, q6[3]
2315 ; CHECK-NEXT: vmov.16 q0[3], r2
2316 ; CHECK-NEXT: vmov.u8 r2, q6[4]
2317 ; CHECK-NEXT: vmov.16 q0[4], r2
2318 ; CHECK-NEXT: vmov.u8 r2, q6[5]
2319 ; CHECK-NEXT: vmov.16 q0[5], r2
2320 ; CHECK-NEXT: vmov.u8 r2, q6[6]
2321 ; CHECK-NEXT: vmov.16 q0[6], r2
2322 ; CHECK-NEXT: vmov.u8 r2, q6[7]
2323 ; CHECK-NEXT: vmov.16 q0[7], r2
2324 ; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill
2325 ; CHECK-NEXT: vcmp.i16 ne, q0, zr
2326 ; CHECK-NEXT: vmov.u8 r4, q3[2]
2327 ; CHECK-NEXT: vpsel q7, q2, q4
2328 ; CHECK-NEXT: vmov.u16 r2, q7[2]
2329 ; CHECK-NEXT: vmov.u16 r3, q7[0]
2330 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
2331 ; CHECK-NEXT: vmov.u16 r2, q7[3]
2332 ; CHECK-NEXT: vmov.u16 r3, q7[1]
2333 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
2334 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2335 ; CHECK-NEXT: vpsel q0, q2, q4
2336 ; CHECK-NEXT: vmov r2, r3, d0
2337 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
2338 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r3
2339 ; CHECK-NEXT: vmov.u8 r2, q1[1]
2340 ; CHECK-NEXT: vmov.u8 r3, q1[0]
2341 ; CHECK-NEXT: vcmp.i32 ne, q2, zr
2342 ; CHECK-NEXT: vmov q5[2], q5[0], r3, r2
2343 ; CHECK-NEXT: vmov.u8 r3, q3[1]
2344 ; CHECK-NEXT: vmov.u8 r2, q3[0]
2345 ; CHECK-NEXT: vmov.i64 q2, #0xff
2346 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
2347 ; CHECK-NEXT: vand q5, q5, q2
2348 ; CHECK-NEXT: vand q4, q4, q2
2349 ; CHECK-NEXT: vmov r12, s22
2350 ; CHECK-NEXT: vmov r2, s18
2351 ; CHECK-NEXT: vmov r3, s20
2352 ; CHECK-NEXT: vmov.i32 q5, #0x0
2353 ; CHECK-NEXT: umull lr, r12, r2, r12
2354 ; CHECK-NEXT: vmov r2, s16
2355 ; CHECK-NEXT: umull r2, r3, r2, r3
2356 ; CHECK-NEXT: vmov q4[2], q4[0], r2, lr
2357 ; CHECK-NEXT: vmov q4[3], q4[1], r3, r12
2358 ; CHECK-NEXT: vpsel q4, q4, q5
2359 ; CHECK-NEXT: vmov lr, r12, d9
2360 ; CHECK-NEXT: vmov r3, r2, d8
2361 ; CHECK-NEXT: adds.w lr, lr, r3
2362 ; CHECK-NEXT: adc.w r12, r12, r2
2363 ; CHECK-NEXT: vmov r2, r3, d1
2364 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
2365 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
2366 ; CHECK-NEXT: vmov.u8 r2, q1[3]
2367 ; CHECK-NEXT: vmov.u8 r3, q1[2]
2368 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2369 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
2370 ; CHECK-NEXT: vmov.u8 r3, q3[3]
2371 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r3
2372 ; CHECK-NEXT: vand q0, q0, q2
2373 ; CHECK-NEXT: vand q4, q4, q2
2374 ; CHECK-NEXT: vmov r2, s2
2375 ; CHECK-NEXT: vmov r3, s18
2376 ; CHECK-NEXT: vmov r5, s16
2377 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
2378 ; CHECK-NEXT: vmov r4, s0
2379 ; CHECK-NEXT: umull r2, r3, r3, r2
2380 ; CHECK-NEXT: umull r4, r5, r5, r4
2381 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r2
2382 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r3
2383 ; CHECK-NEXT: vpsel q0, q0, q5
2384 ; CHECK-NEXT: vmov r2, r3, d0
2385 ; CHECK-NEXT: vmov r5, r4, d1
2386 ; CHECK-NEXT: adds.w r2, r2, lr
2387 ; CHECK-NEXT: adc.w r3, r3, r12
2388 ; CHECK-NEXT: adds.w r12, r2, r5
2389 ; CHECK-NEXT: adc.w lr, r3, r4
2390 ; CHECK-NEXT: vmov.u16 r5, q7[6]
2391 ; CHECK-NEXT: vmov.u16 r4, q7[4]
2392 ; CHECK-NEXT: vmov.u8 r2, q3[4]
2393 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
2394 ; CHECK-NEXT: vmov.u16 r5, q7[7]
2395 ; CHECK-NEXT: vmov.u16 r4, q7[5]
2396 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2397 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2398 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
2399 ; CHECK-NEXT: vpsel q0, q0, q4
2400 ; CHECK-NEXT: vmov r5, r4, d0
2401 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
2402 ; CHECK-NEXT: vmov q4[3], q4[1], r5, r4
2403 ; CHECK-NEXT: vmov.u8 r5, q1[5]
2404 ; CHECK-NEXT: vmov.u8 r4, q1[4]
2405 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
2406 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r5
2407 ; CHECK-NEXT: vmov.u8 r4, q3[5]
2408 ; CHECK-NEXT: vmov q7[2], q7[0], r2, r4
2409 ; CHECK-NEXT: vand q4, q4, q2
2410 ; CHECK-NEXT: vand q7, q7, q2
2411 ; CHECK-NEXT: vmov r5, s18
2412 ; CHECK-NEXT: vmov r2, s30
2413 ; CHECK-NEXT: vmov r3, s28
2414 ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
2415 ; CHECK-NEXT: vmov r4, s16
2416 ; CHECK-NEXT: umull r2, r5, r2, r5
2417 ; CHECK-NEXT: umull r3, r4, r3, r4
2418 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
2419 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r5
2420 ; CHECK-NEXT: vpsel q4, q4, q5
2421 ; CHECK-NEXT: vmov r2, r3, d8
2422 ; CHECK-NEXT: vmov r5, r4, d9
2423 ; CHECK-NEXT: adds.w r2, r2, r12
2424 ; CHECK-NEXT: adc.w r3, r3, lr
2425 ; CHECK-NEXT: adds.w r12, r2, r5
2426 ; CHECK-NEXT: adc.w lr, r3, r4
2427 ; CHECK-NEXT: vmov r5, r4, d1
2428 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
2429 ; CHECK-NEXT: vmov.u8 r2, q3[6]
2430 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
2431 ; CHECK-NEXT: vmov.u8 r5, q1[7]
2432 ; CHECK-NEXT: vmov.u8 r4, q1[6]
2433 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2434 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
2435 ; CHECK-NEXT: vmov.u8 r4, q3[7]
2436 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r4
2437 ; CHECK-NEXT: vand q0, q0, q2
2438 ; CHECK-NEXT: vand q4, q4, q2
2439 ; CHECK-NEXT: vmov r5, s2
2440 ; CHECK-NEXT: vmov r2, s18
2441 ; CHECK-NEXT: vmov r3, s16
2442 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
2443 ; CHECK-NEXT: vmov r4, s0
2444 ; CHECK-NEXT: umull r2, r5, r2, r5
2445 ; CHECK-NEXT: umull r3, r4, r3, r4
2446 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
2447 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2448 ; CHECK-NEXT: vpsel q0, q0, q5
2449 ; CHECK-NEXT: vmov r2, r3, d0
2450 ; CHECK-NEXT: vmov r5, r4, d1
2451 ; CHECK-NEXT: adds.w r2, r2, r12
2452 ; CHECK-NEXT: adc.w r3, r3, lr
2453 ; CHECK-NEXT: adds.w r12, r2, r5
2454 ; CHECK-NEXT: vmov.u8 r5, q6[8]
2455 ; CHECK-NEXT: adc.w lr, r3, r4
2456 ; CHECK-NEXT: vmov.16 q0[0], r5
2457 ; CHECK-NEXT: vmov.u8 r5, q6[9]
2458 ; CHECK-NEXT: vmov.16 q0[1], r5
2459 ; CHECK-NEXT: vmov.u8 r5, q6[10]
2460 ; CHECK-NEXT: vmov.16 q0[2], r5
2461 ; CHECK-NEXT: vmov.u8 r5, q6[11]
2462 ; CHECK-NEXT: vmov.16 q0[3], r5
2463 ; CHECK-NEXT: vmov.u8 r5, q6[12]
2464 ; CHECK-NEXT: vmov.16 q0[4], r5
2465 ; CHECK-NEXT: vmov.u8 r5, q6[13]
2466 ; CHECK-NEXT: vmov.16 q0[5], r5
2467 ; CHECK-NEXT: vmov.u8 r5, q6[14]
2468 ; CHECK-NEXT: vmov.16 q0[6], r5
2469 ; CHECK-NEXT: vmov.u8 r5, q6[15]
2470 ; CHECK-NEXT: vmov.16 q0[7], r5
2471 ; CHECK-NEXT: vmov.u8 r2, q3[8]
2472 ; CHECK-NEXT: vcmp.i16 ne, q0, zr
2473 ; CHECK-NEXT: vpsel q6, q7, q4
2474 ; CHECK-NEXT: vmov.u16 r5, q6[2]
2475 ; CHECK-NEXT: vmov.u16 r4, q6[0]
2476 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
2477 ; CHECK-NEXT: vmov.u16 r5, q6[3]
2478 ; CHECK-NEXT: vmov.u16 r4, q6[1]
2479 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2480 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2481 ; CHECK-NEXT: vpsel q0, q7, q4
2482 ; CHECK-NEXT: vmov r5, r4, d0
2483 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
2484 ; CHECK-NEXT: vmov q4[3], q4[1], r5, r4
2485 ; CHECK-NEXT: vmov.u8 r5, q1[9]
2486 ; CHECK-NEXT: vmov.u8 r4, q1[8]
2487 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
2488 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r5
2489 ; CHECK-NEXT: vmov.u8 r4, q3[9]
2490 ; CHECK-NEXT: vmov q7[2], q7[0], r2, r4
2491 ; CHECK-NEXT: vand q4, q4, q2
2492 ; CHECK-NEXT: vand q7, q7, q2
2493 ; CHECK-NEXT: vmov r5, s18
2494 ; CHECK-NEXT: vmov r2, s30
2495 ; CHECK-NEXT: vmov r4, s16
2496 ; CHECK-NEXT: vmov r3, s28
2497 ; CHECK-NEXT: umull r2, r5, r2, r5
2498 ; CHECK-NEXT: umull r3, r4, r3, r4
2499 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
2500 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r5
2501 ; CHECK-NEXT: vpsel q4, q4, q5
2502 ; CHECK-NEXT: vmov r2, r3, d8
2503 ; CHECK-NEXT: vmov r5, r4, d9
2504 ; CHECK-NEXT: adds.w r2, r2, r12
2505 ; CHECK-NEXT: adc.w r3, r3, lr
2506 ; CHECK-NEXT: adds.w r12, r2, r5
2507 ; CHECK-NEXT: adc.w lr, r3, r4
2508 ; CHECK-NEXT: vmov r5, r4, d1
2509 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
2510 ; CHECK-NEXT: vmov.u8 r2, q3[10]
2511 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
2512 ; CHECK-NEXT: vmov.u8 r5, q1[11]
2513 ; CHECK-NEXT: vmov.u8 r4, q1[10]
2514 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2515 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
2516 ; CHECK-NEXT: vmov.u8 r4, q3[11]
2517 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r4
2518 ; CHECK-NEXT: vand q0, q0, q2
2519 ; CHECK-NEXT: vand q4, q4, q2
2520 ; CHECK-NEXT: vmov r5, s2
2521 ; CHECK-NEXT: vmov r2, s18
2522 ; CHECK-NEXT: vmov r3, s16
2523 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
2524 ; CHECK-NEXT: vmov r4, s0
2525 ; CHECK-NEXT: umull r2, r5, r2, r5
2526 ; CHECK-NEXT: umull r3, r4, r3, r4
2527 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
2528 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2529 ; CHECK-NEXT: vpsel q0, q0, q5
2530 ; CHECK-NEXT: vmov r2, r3, d0
2531 ; CHECK-NEXT: vmov r5, r4, d1
2532 ; CHECK-NEXT: adds.w r2, r2, r12
2533 ; CHECK-NEXT: adc.w r3, r3, lr
2534 ; CHECK-NEXT: adds.w r12, r2, r5
2535 ; CHECK-NEXT: adc.w lr, r3, r4
2536 ; CHECK-NEXT: vmov.u16 r5, q6[6]
2537 ; CHECK-NEXT: vmov.u16 r4, q6[4]
2538 ; CHECK-NEXT: vmov.u8 r2, q3[12]
2539 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
2540 ; CHECK-NEXT: vmov.u16 r5, q6[7]
2541 ; CHECK-NEXT: vmov.u16 r4, q6[5]
2542 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2543 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2544 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
2545 ; CHECK-NEXT: vpsel q0, q0, q4
2546 ; CHECK-NEXT: vmov r5, r4, d0
2547 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
2548 ; CHECK-NEXT: vmov q4[3], q4[1], r5, r4
2549 ; CHECK-NEXT: vmov.u8 r5, q1[13]
2550 ; CHECK-NEXT: vmov.u8 r4, q1[12]
2551 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
2552 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r5
2553 ; CHECK-NEXT: vmov.u8 r4, q3[13]
2554 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r4
2555 ; CHECK-NEXT: vand q4, q4, q2
2556 ; CHECK-NEXT: vand q6, q6, q2
2557 ; CHECK-NEXT: vmov r5, s18
2558 ; CHECK-NEXT: vmov r2, s26
2559 ; CHECK-NEXT: vmov r4, s16
2560 ; CHECK-NEXT: vmov r3, s24
2561 ; CHECK-NEXT: umull r2, r5, r2, r5
2562 ; CHECK-NEXT: umull r3, r4, r3, r4
2563 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
2564 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r5
2565 ; CHECK-NEXT: vpsel q4, q4, q5
2566 ; CHECK-NEXT: vmov r2, r3, d8
2567 ; CHECK-NEXT: vmov r5, r4, d9
2568 ; CHECK-NEXT: adds.w r2, r2, r12
2569 ; CHECK-NEXT: adc.w r3, r3, lr
2570 ; CHECK-NEXT: adds.w r12, r2, r5
2571 ; CHECK-NEXT: adc.w lr, r3, r4
2572 ; CHECK-NEXT: vmov r5, r4, d1
2573 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
2574 ; CHECK-NEXT: vmov.u8 r2, q3[14]
2575 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
2576 ; CHECK-NEXT: vmov.u8 r5, q1[15]
2577 ; CHECK-NEXT: vmov.u8 r4, q1[14]
2578 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2579 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
2580 ; CHECK-NEXT: vmov.u8 r4, q3[15]
2581 ; CHECK-NEXT: vmov q1[2], q1[0], r2, r4
2582 ; CHECK-NEXT: vand q0, q0, q2
2583 ; CHECK-NEXT: vand q1, q1, q2
2584 ; CHECK-NEXT: vmov r5, s2
2585 ; CHECK-NEXT: vmov r2, s6
2586 ; CHECK-NEXT: vmov r4, s0
2587 ; CHECK-NEXT: vmov r3, s4
2588 ; CHECK-NEXT: umull r2, r5, r2, r5
2589 ; CHECK-NEXT: umull r3, r4, r3, r4
2590 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
2591 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2592 ; CHECK-NEXT: vpsel q0, q0, q5
2593 ; CHECK-NEXT: vmov r2, r3, d0
2594 ; CHECK-NEXT: vmov r5, r4, d1
2595 ; CHECK-NEXT: adds.w r2, r2, r12
2596 ; CHECK-NEXT: adc.w r3, r3, lr
2597 ; CHECK-NEXT: adds r2, r2, r5
2598 ; CHECK-NEXT: adcs r3, r4
2599 ; CHECK-NEXT: adds r0, r0, r2
2600 ; CHECK-NEXT: adcs r1, r3
2601 ; CHECK-NEXT: add sp, #32
2602 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2603 ; CHECK-NEXT: pop {r4, r5, r7, pc}
2604 entry:
2605 %c = icmp eq <16 x i8> %b, zeroinitializer
2606 %xx = zext <16 x i8> %x to <16 x i64>
2607 %yy = zext <16 x i8> %y to <16 x i64>
2608 %m = mul <16 x i64> %xx, %yy
2609 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
2610 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
2611 %r = add i64 %z, %a
2612 ret i64 %r
2613 }
2615 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
2616 ; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
2617 ; CHECK: @ %bb.0: @ %entry
2618 ; CHECK-NEXT: .save {r4, r5, r7, lr}
2619 ; CHECK-NEXT: push {r4, r5, r7, lr}
2620 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2621 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2622 ; CHECK-NEXT: .pad #16
2623 ; CHECK-NEXT: sub sp, #16
2624 ; CHECK-NEXT: vmov q3, q0
2625 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
2626 ; CHECK-NEXT: vmov.i8 q0, #0x0
2627 ; CHECK-NEXT: vmov.i8 q2, #0xff
2628 ; CHECK-NEXT: vpsel q5, q2, q0
2629 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
2630 ; CHECK-NEXT: vmov.u8 r2, q5[0]
2631 ; CHECK-NEXT: vmov.s8 r4, q1[2]
2632 ; CHECK-NEXT: vmov.16 q4[0], r2
2633 ; CHECK-NEXT: vmov.u8 r2, q5[1]
2634 ; CHECK-NEXT: vmov.16 q4[1], r2
2635 ; CHECK-NEXT: vmov.u8 r2, q5[2]
2636 ; CHECK-NEXT: vmov.16 q4[2], r2
2637 ; CHECK-NEXT: vmov.u8 r2, q5[3]
2638 ; CHECK-NEXT: vmov.16 q4[3], r2
2639 ; CHECK-NEXT: vmov.u8 r2, q5[4]
2640 ; CHECK-NEXT: vmov.16 q4[4], r2
2641 ; CHECK-NEXT: vmov.u8 r2, q5[5]
2642 ; CHECK-NEXT: vmov.16 q4[5], r2
2643 ; CHECK-NEXT: vmov.u8 r2, q5[6]
2644 ; CHECK-NEXT: vmov.16 q4[6], r2
2645 ; CHECK-NEXT: vmov.u8 r2, q5[7]
2646 ; CHECK-NEXT: vmov.16 q4[7], r2
2647 ; CHECK-NEXT: vmov.s8 r5, q3[2]
2648 ; CHECK-NEXT: vcmp.i16 ne, q4, zr
2649 ; CHECK-NEXT: smull r4, r5, r5, r4
2650 ; CHECK-NEXT: vpsel q6, q2, q0
2651 ; CHECK-NEXT: vmov.u16 r2, q6[2]
2652 ; CHECK-NEXT: vmov.u16 r3, q6[0]
2653 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
2654 ; CHECK-NEXT: vmov.u16 r2, q6[3]
2655 ; CHECK-NEXT: vmov.u16 r3, q6[1]
2656 ; CHECK-NEXT: vmov q4[3], q4[1], r3, r2
2657 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
2658 ; CHECK-NEXT: vpsel q7, q2, q0
2659 ; CHECK-NEXT: vmov r2, r3, d14
2660 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
2661 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3
2662 ; CHECK-NEXT: vmov.s8 r2, q1[1]
2663 ; CHECK-NEXT: vmov.s8 r3, q3[1]
2664 ; CHECK-NEXT: vcmp.i32 ne, q4, zr
2665 ; CHECK-NEXT: smull lr, r12, r3, r2
2666 ; CHECK-NEXT: vmov.s8 r3, q1[0]
2667 ; CHECK-NEXT: vmov.s8 r2, q3[0]
2668 ; CHECK-NEXT: vmov.i32 q4, #0x0
2669 ; CHECK-NEXT: smull r2, r3, r2, r3
2670 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
2671 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
2672 ; CHECK-NEXT: vpsel q0, q0, q4
2673 ; CHECK-NEXT: vmov lr, r12, d1
2674 ; CHECK-NEXT: vmov r3, r2, d0
2675 ; CHECK-NEXT: adds.w lr, lr, r3
2676 ; CHECK-NEXT: adc.w r12, r12, r2
2677 ; CHECK-NEXT: vmov r2, r3, d15
2678 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
2679 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
2680 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
2681 ; CHECK-NEXT: vmov.s8 r2, q1[3]
2682 ; CHECK-NEXT: vmov.s8 r3, q3[3]
2683 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2684 ; CHECK-NEXT: smull r2, r3, r3, r2
2685 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r2
2686 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r3
2687 ; CHECK-NEXT: vpsel q0, q0, q4
2688 ; CHECK-NEXT: vmov r2, r3, d0
2689 ; CHECK-NEXT: vmov r5, r4, d1
2690 ; CHECK-NEXT: adds.w r2, r2, lr
2691 ; CHECK-NEXT: adc.w r3, r3, r12
2692 ; CHECK-NEXT: adds.w r12, r2, r5
2693 ; CHECK-NEXT: adc.w lr, r3, r4
2694 ; CHECK-NEXT: vmov.u16 r5, q6[6]
2695 ; CHECK-NEXT: vmov.u16 r4, q6[4]
2696 ; CHECK-NEXT: vmov.s8 r2, q1[4]
2697 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
2698 ; CHECK-NEXT: vmov.u16 r5, q6[7]
2699 ; CHECK-NEXT: vmov.u16 r4, q6[5]
2700 ; CHECK-NEXT: vmov.s8 r3, q3[4]
2701 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2702 ; CHECK-NEXT: smull r2, r3, r3, r2
2703 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2704 ; CHECK-NEXT: vpsel q6, q2, q7
2705 ; CHECK-NEXT: vmov r5, r4, d12
2706 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
2707 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
2708 ; CHECK-NEXT: vmov.s8 r5, q1[5]
2709 ; CHECK-NEXT: vmov.s8 r4, q3[5]
2710 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2711 ; CHECK-NEXT: smull r5, r4, r4, r5
2712 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r5
2713 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r4
2714 ; CHECK-NEXT: vpsel q0, q0, q4
2715 ; CHECK-NEXT: vmov r2, r3, d0
2716 ; CHECK-NEXT: vmov r5, r4, d1
2717 ; CHECK-NEXT: adds.w r2, r2, r12
2718 ; CHECK-NEXT: adc.w r3, r3, lr
2719 ; CHECK-NEXT: adds.w r12, r2, r5
2720 ; CHECK-NEXT: adc.w lr, r3, r4
2721 ; CHECK-NEXT: vmov r5, r4, d13
2722 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
2723 ; CHECK-NEXT: vmov.s8 r2, q1[6]
2724 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
2725 ; CHECK-NEXT: vmov.s8 r3, q3[6]
2726 ; CHECK-NEXT: vmov.s8 r5, q1[7]
2727 ; CHECK-NEXT: vmov.s8 r4, q3[7]
2728 ; CHECK-NEXT: smull r5, r4, r4, r5
2729 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2730 ; CHECK-NEXT: smull r2, r3, r3, r2
2731 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r5
2732 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r4
2733 ; CHECK-NEXT: vpsel q0, q0, q4
2734 ; CHECK-NEXT: vmov r2, r3, d0
2735 ; CHECK-NEXT: vmov r5, r4, d1
2736 ; CHECK-NEXT: adds.w r2, r2, r12
2737 ; CHECK-NEXT: adc.w r3, r3, lr
2738 ; CHECK-NEXT: adds.w r12, r2, r5
2739 ; CHECK-NEXT: vmov.u8 r5, q5[8]
2740 ; CHECK-NEXT: adc.w lr, r3, r4
2741 ; CHECK-NEXT: vmov.16 q6[0], r5
2742 ; CHECK-NEXT: vmov.u8 r5, q5[9]
2743 ; CHECK-NEXT: vmov.16 q6[1], r5
2744 ; CHECK-NEXT: vmov.u8 r5, q5[10]
2745 ; CHECK-NEXT: vmov.16 q6[2], r5
2746 ; CHECK-NEXT: vmov.u8 r5, q5[11]
2747 ; CHECK-NEXT: vmov.16 q6[3], r5
2748 ; CHECK-NEXT: vmov.u8 r5, q5[12]
2749 ; CHECK-NEXT: vmov.16 q6[4], r5
2750 ; CHECK-NEXT: vmov.u8 r5, q5[13]
2751 ; CHECK-NEXT: vmov.16 q6[5], r5
2752 ; CHECK-NEXT: vmov.u8 r5, q5[14]
2753 ; CHECK-NEXT: vmov.16 q6[6], r5
2754 ; CHECK-NEXT: vmov.u8 r5, q5[15]
2755 ; CHECK-NEXT: vmov.16 q6[7], r5
2756 ; CHECK-NEXT: vmov.s8 r2, q1[8]
2757 ; CHECK-NEXT: vcmp.i16 ne, q6, zr
2758 ; CHECK-NEXT: vmov.s8 r3, q3[8]
2759 ; CHECK-NEXT: vpsel q5, q2, q7
2760 ; CHECK-NEXT: smull r2, r3, r3, r2
2761 ; CHECK-NEXT: vmov.u16 r5, q5[2]
2762 ; CHECK-NEXT: vmov.u16 r4, q5[0]
2763 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
2764 ; CHECK-NEXT: vmov.u16 r5, q5[3]
2765 ; CHECK-NEXT: vmov.u16 r4, q5[1]
2766 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2767 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2768 ; CHECK-NEXT: vpsel q6, q2, q7
2769 ; CHECK-NEXT: vmov r5, r4, d12
2770 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
2771 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
2772 ; CHECK-NEXT: vmov.s8 r5, q1[9]
2773 ; CHECK-NEXT: vmov.s8 r4, q3[9]
2774 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2775 ; CHECK-NEXT: smull r5, r4, r4, r5
2776 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r5
2777 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r4
2778 ; CHECK-NEXT: vpsel q0, q0, q4
2779 ; CHECK-NEXT: vmov r2, r3, d0
2780 ; CHECK-NEXT: vmov r5, r4, d1
2781 ; CHECK-NEXT: adds.w r2, r2, r12
2782 ; CHECK-NEXT: adc.w r3, r3, lr
2783 ; CHECK-NEXT: adds.w r12, r2, r5
2784 ; CHECK-NEXT: adc.w lr, r3, r4
2785 ; CHECK-NEXT: vmov r5, r4, d13
2786 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
2787 ; CHECK-NEXT: vmov.s8 r2, q1[10]
2788 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
2789 ; CHECK-NEXT: vmov.s8 r3, q3[10]
2790 ; CHECK-NEXT: vmov.s8 r5, q1[11]
2791 ; CHECK-NEXT: vmov.s8 r4, q3[11]
2792 ; CHECK-NEXT: smull r5, r4, r4, r5
2793 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2794 ; CHECK-NEXT: smull r2, r3, r3, r2
2795 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r5
2796 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r4
2797 ; CHECK-NEXT: vpsel q0, q0, q4
2798 ; CHECK-NEXT: vmov r2, r3, d0
2799 ; CHECK-NEXT: vmov r5, r4, d1
2800 ; CHECK-NEXT: adds.w r2, r2, r12
2801 ; CHECK-NEXT: adc.w r3, r3, lr
2802 ; CHECK-NEXT: adds.w r12, r2, r5
2803 ; CHECK-NEXT: adc.w lr, r3, r4
2804 ; CHECK-NEXT: vmov.u16 r5, q5[6]
2805 ; CHECK-NEXT: vmov.u16 r4, q5[4]
2806 ; CHECK-NEXT: vmov.s8 r2, q1[12]
2807 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
2808 ; CHECK-NEXT: vmov.u16 r5, q5[7]
2809 ; CHECK-NEXT: vmov.u16 r4, q5[5]
2810 ; CHECK-NEXT: vmov.s8 r3, q3[12]
2811 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2812 ; CHECK-NEXT: smull r2, r3, r3, r2
2813 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2814 ; CHECK-NEXT: vpsel q2, q2, q7
2815 ; CHECK-NEXT: vmov r5, r4, d4
2816 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
2817 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
2818 ; CHECK-NEXT: vmov.s8 r5, q1[13]
2819 ; CHECK-NEXT: vmov.s8 r4, q3[13]
2820 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2821 ; CHECK-NEXT: smull r5, r4, r4, r5
2822 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r5
2823 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r4
2824 ; CHECK-NEXT: vpsel q0, q0, q4
2825 ; CHECK-NEXT: vmov r2, r3, d0
2826 ; CHECK-NEXT: vmov r5, r4, d1
2827 ; CHECK-NEXT: adds.w r2, r2, r12
2828 ; CHECK-NEXT: adc.w r3, r3, lr
2829 ; CHECK-NEXT: adds.w r12, r2, r5
2830 ; CHECK-NEXT: adc.w lr, r3, r4
2831 ; CHECK-NEXT: vmov r5, r4, d5
2832 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
2833 ; CHECK-NEXT: vmov.s8 r2, q1[14]
2834 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
2835 ; CHECK-NEXT: vmov.s8 r3, q3[14]
2836 ; CHECK-NEXT: vmov.s8 r5, q1[15]
2837 ; CHECK-NEXT: vmov.s8 r4, q3[15]
2838 ; CHECK-NEXT: smull r5, r4, r4, r5
2839 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2840 ; CHECK-NEXT: smull r2, r3, r3, r2
2841 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r5
2842 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r4
2843 ; CHECK-NEXT: vpsel q0, q0, q4
2844 ; CHECK-NEXT: vmov r2, r3, d0
2845 ; CHECK-NEXT: vmov r5, r4, d1
2846 ; CHECK-NEXT: adds.w r2, r2, r12
2847 ; CHECK-NEXT: adc.w r3, r3, lr
2848 ; CHECK-NEXT: adds r2, r2, r5
2849 ; CHECK-NEXT: adcs r3, r4
2850 ; CHECK-NEXT: adds r0, r0, r2
2851 ; CHECK-NEXT: adcs r1, r3
2852 ; CHECK-NEXT: add sp, #16
2853 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2854 ; CHECK-NEXT: pop {r4, r5, r7, pc}
2855 entry:
2856 %c = icmp eq <16 x i8> %b, zeroinitializer
2857 %xx = sext <16 x i8> %x to <16 x i64>
2858 %yy = sext <16 x i8> %y to <16 x i64>
2859 %m = mul <16 x i64> %xx, %yy
2860 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
2861 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
2862 %r = add i64 %z, %a
2863 ret i64 %r
2864 }
2866 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
2867 ; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
2868 ; CHECK: @ %bb.0: @ %entry
2869 ; CHECK-NEXT: .save {r7, lr}
2870 ; CHECK-NEXT: push {r7, lr}
2871 ; CHECK-NEXT: vmov.i64 q3, #0xff
2872 ; CHECK-NEXT: vand q1, q1, q3
2873 ; CHECK-NEXT: vand q0, q0, q3
2874 ; CHECK-NEXT: vmov r2, s6
2875 ; CHECK-NEXT: vmov r3, s2
2876 ; CHECK-NEXT: umull lr, r12, r3, r2
2877 ; CHECK-NEXT: vmov r3, s4
2878 ; CHECK-NEXT: vmov r2, s0
2879 ; CHECK-NEXT: vand q1, q2, q3
2880 ; CHECK-NEXT: umull r2, r3, r2, r3
2881 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
2882 ; CHECK-NEXT: vmov r2, s4
2883 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
2884 ; CHECK-NEXT: movs r3, #0
2885 ; CHECK-NEXT: cmp r2, #0
2886 ; CHECK-NEXT: csetm r2, eq
2887 ; CHECK-NEXT: bfi r3, r2, #0, #8
2888 ; CHECK-NEXT: vmov r2, s6
2889 ; CHECK-NEXT: vmov.i32 q1, #0x0
2890 ; CHECK-NEXT: cmp r2, #0
2891 ; CHECK-NEXT: csetm r2, eq
2892 ; CHECK-NEXT: bfi r3, r2, #8, #8
2893 ; CHECK-NEXT: vmsr p0, r3
2894 ; CHECK-NEXT: vpsel q0, q0, q1
2895 ; CHECK-NEXT: vmov lr, r12, d1
2896 ; CHECK-NEXT: vmov r3, r2, d0
2897 ; CHECK-NEXT: adds.w r3, r3, lr
2898 ; CHECK-NEXT: adc.w r2, r2, r12
2899 ; CHECK-NEXT: adds r0, r0, r3
2900 ; CHECK-NEXT: adcs r1, r2
2901 ; CHECK-NEXT: pop {r7, pc}
2902 entry:
2903 %c = icmp eq <2 x i8> %b, zeroinitializer
2904 %xx = zext <2 x i8> %x to <2 x i64>
2905 %yy = zext <2 x i8> %y to <2 x i64>
2906 %m = mul <2 x i64> %xx, %yy
2907 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2908 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
2909 %r = add i64 %z, %a
2910 ret i64 %r
2911 }
2913 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
2914 ; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
2915 ; CHECK: @ %bb.0: @ %entry
2916 ; CHECK-NEXT: .save {r7, lr}
2917 ; CHECK-NEXT: push {r7, lr}
2918 ; CHECK-NEXT: vmov.i32 q3, #0xff
2919 ; CHECK-NEXT: movs r3, #0
2920 ; CHECK-NEXT: vand q2, q2, q3
2921 ; CHECK-NEXT: vmov r2, s8
2922 ; CHECK-NEXT: cmp r2, #0
2923 ; CHECK-NEXT: csetm r2, eq
2924 ; CHECK-NEXT: bfi r3, r2, #0, #8
2925 ; CHECK-NEXT: vmov r2, s10
2926 ; CHECK-NEXT: cmp r2, #0
2927 ; CHECK-NEXT: csetm r2, eq
2928 ; CHECK-NEXT: bfi r3, r2, #8, #8
2929 ; CHECK-NEXT: vmov r2, s6
2930 ; CHECK-NEXT: vmsr p0, r3
2931 ; CHECK-NEXT: vmov r3, s2
2932 ; CHECK-NEXT: sxtb r2, r2
2933 ; CHECK-NEXT: sxtb r3, r3
2934 ; CHECK-NEXT: smull lr, r12, r3, r2
2935 ; CHECK-NEXT: vmov r3, s4
2936 ; CHECK-NEXT: vmov r2, s0
2937 ; CHECK-NEXT: vmov.i32 q1, #0x0
2938 ; CHECK-NEXT: sxtb r3, r3
2939 ; CHECK-NEXT: sxtb r2, r2
2940 ; CHECK-NEXT: smull r2, r3, r2, r3
2941 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
2942 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
2943 ; CHECK-NEXT: vpsel q0, q0, q1
2944 ; CHECK-NEXT: vmov lr, r12, d1
2945 ; CHECK-NEXT: vmov r3, r2, d0
2946 ; CHECK-NEXT: adds.w r3, r3, lr
2947 ; CHECK-NEXT: adc.w r2, r2, r12
2948 ; CHECK-NEXT: adds r0, r0, r3
2949 ; CHECK-NEXT: adcs r1, r2
2950 ; CHECK-NEXT: pop {r7, pc}
2951 entry:
2952 %c = icmp eq <2 x i8> %b, zeroinitializer
2953 %xx = sext <2 x i8> %x to <2 x i64>
2954 %yy = sext <2 x i8> %y to <2 x i64>
2955 %m = mul <2 x i64> %xx, %yy
2956 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2957 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
2958 %r = add i64 %z, %a
2959 ret i64 %r
2960 }
2962 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) {
2963 ; CHECK-LABEL: add_v2i64_v2i64_acc:
2964 ; CHECK: @ %bb.0: @ %entry
2965 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
2966 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
2967 ; CHECK-NEXT: vmov r2, r12, d3
2968 ; CHECK-NEXT: vmov r3, lr, d1
2969 ; CHECK-NEXT: vmov r6, r9, d2
2970 ; CHECK-NEXT: vmov.i32 q1, #0x0
2971 ; CHECK-NEXT: vmov r5, r11, d0
2972 ; CHECK-NEXT: umull r10, r8, r3, r2
2973 ; CHECK-NEXT: umull r4, r7, r5, r6
2974 ; CHECK-NEXT: mla r3, r3, r12, r8
2975 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r10
2976 ; CHECK-NEXT: mla r2, lr, r2, r3
2977 ; CHECK-NEXT: mla r3, r5, r9, r7
2978 ; CHECK-NEXT: mla r3, r11, r6, r3
2979 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
2980 ; CHECK-NEXT: vmov r2, r3, d4
2981 ; CHECK-NEXT: orrs r2, r3
2982 ; CHECK-NEXT: mov.w r3, #0
2983 ; CHECK-NEXT: csetm r2, eq
2984 ; CHECK-NEXT: bfi r3, r2, #0, #8
2985 ; CHECK-NEXT: vmov r2, r7, d5
2986 ; CHECK-NEXT: orrs r2, r7
2987 ; CHECK-NEXT: csetm r2, eq
2988 ; CHECK-NEXT: bfi r3, r2, #8, #8
2989 ; CHECK-NEXT: vmsr p0, r3
2990 ; CHECK-NEXT: vpsel q0, q0, q1
2991 ; CHECK-NEXT: vmov r2, r3, d1
2992 ; CHECK-NEXT: vmov r7, r6, d0
2993 ; CHECK-NEXT: adds r2, r2, r7
2994 ; CHECK-NEXT: adcs r3, r6
2995 ; CHECK-NEXT: adds r0, r0, r2
2996 ; CHECK-NEXT: adcs r1, r3
2997 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
2998 entry:
2999 %c = icmp eq <2 x i64> %b, zeroinitializer
3000 %m = mul <2 x i64> %x, %y
3001 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
3002 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
3003 %r = add i64 %z, %a
3004 ret i64 %r
3005 }
3007 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
3008 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
3009 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
3010 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
3011 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
3012 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
3013 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
3014 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
3015 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
3016 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)