1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
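; Each test below multiplies two vectors, masks the products with a select on an
; icmp-eq-zero of %b, and sums them with llvm.vector.reduce.add; where MVE has a
; suitable instruction this should become a predicated VMLAV/VMLALV inside a VPT
; block, and otherwise the reduction is scalarised.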
4 define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
5 ; CHECK-LABEL: add_v4i32_v4i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: vpt.i32 eq, q2, zr
8 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
11 %c = icmp eq <4 x i32> %b, zeroinitializer
12 %m = mul <4 x i32> %x, %y
13 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
18 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
19 ; CHECK-LABEL: add_v4i32_v4i64_zext:
20 ; CHECK: @ %bb.0: @ %entry
21 ; CHECK-NEXT: vpt.i32 eq, q2, zr
22 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
25 %c = icmp eq <4 x i32> %b, zeroinitializer
26 %xx = zext <4 x i32> %x to <4 x i64>
27 %yy = zext <4 x i32> %y to <4 x i64>
28 %m = mul <4 x i64> %xx, %yy
29 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
34 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
35 ; CHECK-LABEL: add_v4i32_v4i64_sext:
36 ; CHECK: @ %bb.0: @ %entry
37 ; CHECK-NEXT: vpt.i32 eq, q2, zr
38 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
41 %c = icmp eq <4 x i32> %b, zeroinitializer
42 %xx = sext <4 x i32> %x to <4 x i64>
43 %yy = sext <4 x i32> %y to <4 x i64>
44 %m = mul <4 x i64> %xx, %yy
45 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
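; The <2 x i32> cases have no predicated 64-bit reduction, so the mask is built
; with csetm and applied with vand before the two 64-bit lanes are added by hand.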
50 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
51 ; CHECK-LABEL: add_v2i32_v2i64_zext:
52 ; CHECK: @ %bb.0: @ %entry
53 ; CHECK-NEXT: vmov r0, s10
54 ; CHECK-NEXT: vmullb.u32 q3, q0, q1
55 ; CHECK-NEXT: vmov r1, s8
56 ; CHECK-NEXT: cmp r0, #0
57 ; CHECK-NEXT: cset r0, eq
58 ; CHECK-NEXT: cmp r0, #0
59 ; CHECK-NEXT: csetm r0, ne
60 ; CHECK-NEXT: cmp r1, #0
61 ; CHECK-NEXT: cset r1, eq
62 ; CHECK-NEXT: cmp r1, #0
63 ; CHECK-NEXT: csetm r1, ne
64 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
65 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
66 ; CHECK-NEXT: vand q0, q3, q0
67 ; CHECK-NEXT: vmov r0, r1, d1
68 ; CHECK-NEXT: vmov r2, r3, d0
69 ; CHECK-NEXT: adds r0, r0, r2
70 ; CHECK-NEXT: adcs r1, r3
73 %c = icmp eq <2 x i32> %b, zeroinitializer
74 %xx = zext <2 x i32> %x to <2 x i64>
75 %yy = zext <2 x i32> %y to <2 x i64>
76 %m = mul <2 x i64> %xx, %yy
77 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
82 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
83 ; CHECK-LABEL: add_v2i32_v2i64_sext:
84 ; CHECK: @ %bb.0: @ %entry
85 ; CHECK-NEXT: vmov r0, s10
86 ; CHECK-NEXT: vmullb.s32 q3, q0, q1
87 ; CHECK-NEXT: vmov r1, s8
88 ; CHECK-NEXT: cmp r0, #0
89 ; CHECK-NEXT: cset r0, eq
90 ; CHECK-NEXT: cmp r0, #0
91 ; CHECK-NEXT: csetm r0, ne
92 ; CHECK-NEXT: cmp r1, #0
93 ; CHECK-NEXT: cset r1, eq
94 ; CHECK-NEXT: cmp r1, #0
95 ; CHECK-NEXT: csetm r1, ne
96 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
97 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
98 ; CHECK-NEXT: vand q0, q3, q0
99 ; CHECK-NEXT: vmov r0, r1, d1
100 ; CHECK-NEXT: vmov r2, r3, d0
101 ; CHECK-NEXT: adds r0, r0, r2
102 ; CHECK-NEXT: adcs r1, r3
105 %c = icmp eq <2 x i32> %b, zeroinitializer
106 %xx = sext <2 x i32> %x to <2 x i64>
107 %yy = sext <2 x i32> %y to <2 x i64>
108 %m = mul <2 x i64> %xx, %yy
109 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
114 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
115 ; CHECK-LABEL: add_v8i16_v8i32_zext:
116 ; CHECK: @ %bb.0: @ %entry
117 ; CHECK-NEXT: vpt.i16 eq, q2, zr
118 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
121 %c = icmp eq <8 x i16> %b, zeroinitializer
122 %xx = zext <8 x i16> %x to <8 x i32>
123 %yy = zext <8 x i16> %y to <8 x i32>
124 %m = mul <8 x i32> %xx, %yy
125 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
130 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
131 ; CHECK-LABEL: add_v8i16_v8i32_sext:
132 ; CHECK: @ %bb.0: @ %entry
133 ; CHECK-NEXT: vpt.i16 eq, q2, zr
134 ; CHECK-NEXT: vmlavt.s16 r0, q0, q1
137 %c = icmp eq <8 x i16> %b, zeroinitializer
138 %xx = sext <8 x i16> %x to <8 x i32>
139 %yy = sext <8 x i16> %y to <8 x i32>
140 %m = mul <8 x i32> %xx, %yy
141 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
146 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
147 ; CHECK-LABEL: add_v4i16_v4i32_zext:
148 ; CHECK: @ %bb.0: @ %entry
149 ; CHECK-NEXT: vmovlb.u16 q1, q1
150 ; CHECK-NEXT: vmovlb.u16 q0, q0
151 ; CHECK-NEXT: vmovlb.u16 q2, q2
152 ; CHECK-NEXT: vpt.i32 eq, q2, zr
153 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
156 %c = icmp eq <4 x i16> %b, zeroinitializer
157 %xx = zext <4 x i16> %x to <4 x i32>
158 %yy = zext <4 x i16> %y to <4 x i32>
159 %m = mul <4 x i32> %xx, %yy
160 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
165 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
166 ; CHECK-LABEL: add_v4i16_v4i32_sext:
167 ; CHECK: @ %bb.0: @ %entry
168 ; CHECK-NEXT: vmovlb.s16 q1, q1
169 ; CHECK-NEXT: vmovlb.s16 q0, q0
170 ; CHECK-NEXT: vmovlb.u16 q2, q2
171 ; CHECK-NEXT: vpt.i32 eq, q2, zr
172 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
175 %c = icmp eq <4 x i16> %b, zeroinitializer
176 %xx = sext <4 x i16> %x to <4 x i32>
177 %yy = sext <4 x i16> %y to <4 x i32>
178 %m = mul <4 x i32> %xx, %yy
179 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
184 define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
185 ; CHECK-LABEL: add_v8i16_v8i16:
186 ; CHECK: @ %bb.0: @ %entry
187 ; CHECK-NEXT: vpt.i16 eq, q2, zr
188 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
189 ; CHECK-NEXT: uxth r0, r0
192 %c = icmp eq <8 x i16> %b, zeroinitializer
193 %m = mul <8 x i16> %x, %y
194 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
ret i16 %z
}
199 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
200 ; CHECK-LABEL: add_v8i16_v8i64_zext:
201 ; CHECK: @ %bb.0: @ %entry
202 ; CHECK-NEXT: vpt.i16 eq, q2, zr
203 ; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
206 %c = icmp eq <8 x i16> %b, zeroinitializer
207 %xx = zext <8 x i16> %x to <8 x i64>
208 %yy = zext <8 x i16> %y to <8 x i64>
209 %m = mul <8 x i64> %xx, %yy
210 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
215 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
216 ; CHECK-LABEL: add_v8i16_v8i64_sext:
217 ; CHECK: @ %bb.0: @ %entry
218 ; CHECK-NEXT: vpt.i16 eq, q2, zr
219 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
222 %c = icmp eq <8 x i16> %b, zeroinitializer
223 %xx = sext <8 x i16> %x to <8 x i64>
224 %yy = sext <8 x i16> %y to <8 x i64>
225 %m = mul <8 x i64> %xx, %yy
226 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
231 define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_zext(<8 x i16> %x, <8 x i8> %y, <8 x i16> %b) {
232 ; CHECK-LABEL: add_v8i8i16_v8i64_zext:
233 ; CHECK: @ %bb.0: @ %entry
234 ; CHECK-NEXT: vmovlb.u8 q1, q1
235 ; CHECK-NEXT: vpt.i16 eq, q2, zr
236 ; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
239 %c = icmp eq <8 x i16> %b, zeroinitializer
240 %xx = zext <8 x i16> %x to <8 x i64>
241 %yy = zext <8 x i8> %y to <8 x i64>
242 %m = mul <8 x i64> %xx, %yy
243 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
248 define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y, <8 x i16> %b) {
249 ; CHECK-LABEL: add_v8i8i16_v8i64_sext:
250 ; CHECK: @ %bb.0: @ %entry
251 ; CHECK-NEXT: vmovlb.s8 q1, q1
252 ; CHECK-NEXT: vpt.i16 eq, q2, zr
253 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
256 %c = icmp eq <8 x i16> %b, zeroinitializer
257 %xx = sext <8 x i16> %x to <8 x i64>
258 %yy = sext <8 x i8> %y to <8 x i64>
259 %m = mul <8 x i64> %xx, %yy
260 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
265 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
266 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
267 ; CHECK: @ %bb.0: @ %entry
268 ; CHECK-NEXT: vpt.i16 eq, q2, zr
269 ; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
272 %c = icmp eq <8 x i16> %b, zeroinitializer
273 %xx = zext <8 x i16> %x to <8 x i32>
274 %yy = zext <8 x i16> %y to <8 x i32>
275 %m = mul <8 x i32> %xx, %yy
276 %ma = zext <8 x i32> %m to <8 x i64>
277 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
282 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
283 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
284 ; CHECK: @ %bb.0: @ %entry
285 ; CHECK-NEXT: vpt.i16 eq, q2, zr
286 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
289 %c = icmp eq <8 x i16> %b, zeroinitializer
290 %xx = sext <8 x i16> %x to <8 x i32>
291 %yy = sext <8 x i16> %y to <8 x i32>
292 %m = mul <8 x i32> %xx, %yy
293 %ma = sext <8 x i32> %m to <8 x i64>
294 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
299 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
300 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
301 ; CHECK: @ %bb.0: @ %entry
302 ; CHECK-NEXT: vpt.i16 eq, q2, zr
303 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q0
306 %c = icmp eq <8 x i16> %b, zeroinitializer
307 %xx = sext <8 x i16> %x to <8 x i32>
308 %m = mul <8 x i32> %xx, %xx
309 %ma = zext <8 x i32> %m to <8 x i64>
310 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
315 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
316 ; CHECK-LABEL: add_v4i16_v4i64_zext:
317 ; CHECK: @ %bb.0: @ %entry
318 ; CHECK-NEXT: vmovlb.u16 q1, q1
319 ; CHECK-NEXT: vmovlb.u16 q0, q0
320 ; CHECK-NEXT: vmovlb.u16 q2, q2
321 ; CHECK-NEXT: vpt.i32 eq, q2, zr
322 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
325 %c = icmp eq <4 x i16> %b, zeroinitializer
326 %xx = zext <4 x i16> %x to <4 x i64>
327 %yy = zext <4 x i16> %y to <4 x i64>
328 %m = mul <4 x i64> %xx, %yy
329 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
334 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
335 ; CHECK-LABEL: add_v4i16_v4i64_sext:
336 ; CHECK: @ %bb.0: @ %entry
337 ; CHECK-NEXT: vmovlb.s16 q1, q1
338 ; CHECK-NEXT: vmovlb.s16 q0, q0
339 ; CHECK-NEXT: vmovlb.u16 q2, q2
340 ; CHECK-NEXT: vpt.i32 eq, q2, zr
341 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
344 %c = icmp eq <4 x i16> %b, zeroinitializer
345 %xx = sext <4 x i16> %x to <4 x i64>
346 %yy = sext <4 x i16> %y to <4 x i64>
347 %m = mul <4 x i64> %xx, %yy
348 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
353 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
354 ; CHECK-LABEL: add_v2i16_v2i64_zext:
355 ; CHECK: @ %bb.0: @ %entry
356 ; CHECK-NEXT: vmov.i64 q3, #0xffff
357 ; CHECK-NEXT: vand q1, q1, q3
358 ; CHECK-NEXT: vand q0, q0, q3
359 ; CHECK-NEXT: vmov r0, s6
360 ; CHECK-NEXT: vmov r1, s2
361 ; CHECK-NEXT: vmov r2, s4
362 ; CHECK-NEXT: vand q1, q2, q3
363 ; CHECK-NEXT: vmov r3, s0
364 ; CHECK-NEXT: umull r0, r1, r1, r0
365 ; CHECK-NEXT: umull r2, r3, r3, r2
366 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
367 ; CHECK-NEXT: vmov r0, s6
368 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
369 ; CHECK-NEXT: vmov r1, s4
370 ; CHECK-NEXT: cmp r0, #0
371 ; CHECK-NEXT: cset r0, eq
372 ; CHECK-NEXT: cmp r0, #0
373 ; CHECK-NEXT: csetm r0, ne
374 ; CHECK-NEXT: cmp r1, #0
375 ; CHECK-NEXT: cset r1, eq
376 ; CHECK-NEXT: cmp r1, #0
377 ; CHECK-NEXT: csetm r1, ne
378 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
379 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
380 ; CHECK-NEXT: vand q0, q0, q1
381 ; CHECK-NEXT: vmov r0, r1, d1
382 ; CHECK-NEXT: vmov r2, r3, d0
383 ; CHECK-NEXT: adds r0, r0, r2
384 ; CHECK-NEXT: adcs r1, r3
387 %c = icmp eq <2 x i16> %b, zeroinitializer
388 %xx = zext <2 x i16> %x to <2 x i64>
389 %yy = zext <2 x i16> %y to <2 x i64>
390 %m = mul <2 x i64> %xx, %yy
391 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
396 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
397 ; CHECK-LABEL: add_v2i16_v2i64_sext:
398 ; CHECK: @ %bb.0: @ %entry
399 ; CHECK-NEXT: vmov.i32 q3, #0xffff
400 ; CHECK-NEXT: vmov r2, s4
401 ; CHECK-NEXT: vand q2, q2, q3
402 ; CHECK-NEXT: vmov r3, s0
403 ; CHECK-NEXT: vmov r0, s10
404 ; CHECK-NEXT: vmov r1, s8
405 ; CHECK-NEXT: sxth r2, r2
406 ; CHECK-NEXT: cmp r0, #0
407 ; CHECK-NEXT: sxth r3, r3
408 ; CHECK-NEXT: cset r0, eq
409 ; CHECK-NEXT: smull r2, r3, r3, r2
410 ; CHECK-NEXT: cmp r0, #0
411 ; CHECK-NEXT: csetm r0, ne
412 ; CHECK-NEXT: cmp r1, #0
413 ; CHECK-NEXT: cset r1, eq
414 ; CHECK-NEXT: cmp r1, #0
415 ; CHECK-NEXT: csetm r1, ne
416 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
417 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
418 ; CHECK-NEXT: vmov r0, s6
419 ; CHECK-NEXT: vmov r1, s2
420 ; CHECK-NEXT: sxth r0, r0
421 ; CHECK-NEXT: sxth r1, r1
422 ; CHECK-NEXT: smull r0, r1, r1, r0
423 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
424 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
425 ; CHECK-NEXT: vand q0, q0, q2
426 ; CHECK-NEXT: vmov r0, r1, d1
427 ; CHECK-NEXT: vmov r2, r3, d0
428 ; CHECK-NEXT: adds r0, r0, r2
429 ; CHECK-NEXT: adcs r1, r3
432 %c = icmp eq <2 x i16> %b, zeroinitializer
433 %xx = sext <2 x i16> %x to <2 x i64>
434 %yy = sext <2 x i16> %y to <2 x i64>
435 %m = mul <2 x i64> %xx, %yy
436 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
441 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
442 ; CHECK-LABEL: add_v16i8_v16i32_zext:
443 ; CHECK: @ %bb.0: @ %entry
444 ; CHECK-NEXT: vpt.i8 eq, q2, zr
445 ; CHECK-NEXT: vmlavt.u8 r0, q0, q1
448 %c = icmp eq <16 x i8> %b, zeroinitializer
449 %xx = zext <16 x i8> %x to <16 x i32>
450 %yy = zext <16 x i8> %y to <16 x i32>
451 %m = mul <16 x i32> %xx, %yy
452 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
457 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
458 ; CHECK-LABEL: add_v16i8_v16i32_sext:
459 ; CHECK: @ %bb.0: @ %entry
460 ; CHECK-NEXT: vpt.i8 eq, q2, zr
461 ; CHECK-NEXT: vmlavt.s8 r0, q0, q1
464 %c = icmp eq <16 x i8> %b, zeroinitializer
465 %xx = sext <16 x i8> %x to <16 x i32>
466 %yy = sext <16 x i8> %y to <16 x i32>
467 %m = mul <16 x i32> %xx, %yy
468 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
473 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
474 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
475 ; CHECK: @ %bb.0: @ %entry
476 ; CHECK-NEXT: vpt.i8 eq, q2, zr
477 ; CHECK-NEXT: vmlavt.u8 r0, q0, q1
480 %c = icmp eq <16 x i8> %b, zeroinitializer
481 %xx = zext <16 x i8> %x to <16 x i16>
482 %yy = zext <16 x i8> %y to <16 x i16>
483 %m = mul <16 x i16> %xx, %yy
484 %ma = zext <16 x i16> %m to <16 x i32>
485 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
490 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
491 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
492 ; CHECK: @ %bb.0: @ %entry
493 ; CHECK-NEXT: vpt.i8 eq, q2, zr
494 ; CHECK-NEXT: vmlavt.s8 r0, q0, q1
497 %c = icmp eq <16 x i8> %b, zeroinitializer
498 %xx = sext <16 x i8> %x to <16 x i16>
499 %yy = sext <16 x i8> %y to <16 x i16>
500 %m = mul <16 x i16> %xx, %yy
501 %ma = sext <16 x i16> %m to <16 x i32>
502 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
507 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
508 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
509 ; CHECK: @ %bb.0: @ %entry
510 ; CHECK-NEXT: vpt.i8 eq, q2, zr
511 ; CHECK-NEXT: vmlavt.s8 r0, q0, q0
514 %c = icmp eq <16 x i8> %b, zeroinitializer
515 %xx = sext <16 x i8> %x to <16 x i16>
516 %m = mul <16 x i16> %xx, %xx
517 %ma = zext <16 x i16> %m to <16 x i32>
518 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
523 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
524 ; CHECK-LABEL: add_v8i8_v8i32_zext:
525 ; CHECK: @ %bb.0: @ %entry
526 ; CHECK-NEXT: vmovlb.u8 q1, q1
527 ; CHECK-NEXT: vmovlb.u8 q0, q0
528 ; CHECK-NEXT: vmovlb.u8 q2, q2
529 ; CHECK-NEXT: vpt.i16 eq, q2, zr
530 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
533 %c = icmp eq <8 x i8> %b, zeroinitializer
534 %xx = zext <8 x i8> %x to <8 x i32>
535 %yy = zext <8 x i8> %y to <8 x i32>
536 %m = mul <8 x i32> %xx, %yy
537 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
542 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
543 ; CHECK-LABEL: add_v8i8_v8i32_sext:
544 ; CHECK: @ %bb.0: @ %entry
545 ; CHECK-NEXT: vmovlb.s8 q1, q1
546 ; CHECK-NEXT: vmovlb.s8 q0, q0
547 ; CHECK-NEXT: vmovlb.u8 q2, q2
548 ; CHECK-NEXT: vpt.i16 eq, q2, zr
549 ; CHECK-NEXT: vmlavt.s16 r0, q0, q1
552 %c = icmp eq <8 x i8> %b, zeroinitializer
553 %xx = sext <8 x i8> %x to <8 x i32>
554 %yy = sext <8 x i8> %y to <8 x i32>
555 %m = mul <8 x i32> %xx, %yy
556 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
561 define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_zext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
562 ; CHECK-LABEL: add_v8i8i16_v8i32_zext:
563 ; CHECK: @ %bb.0: @ %entry
564 ; CHECK-NEXT: vmovlb.u8 q0, q0
565 ; CHECK-NEXT: vmovlb.u8 q2, q2
566 ; CHECK-NEXT: vpt.i16 eq, q2, zr
567 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
570 %c = icmp eq <8 x i8> %b, zeroinitializer
571 %xx = zext <8 x i8> %x to <8 x i32>
572 %yy = zext <8 x i16> %y to <8 x i32>
573 %m = mul <8 x i32> %xx, %yy
574 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
579 define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
580 ; CHECK-LABEL: add_v8i8i16_v8i32_sext:
581 ; CHECK: @ %bb.0: @ %entry
582 ; CHECK-NEXT: vmovlb.s8 q0, q0
583 ; CHECK-NEXT: vmovlb.u8 q2, q2
584 ; CHECK-NEXT: vpt.i16 eq, q2, zr
585 ; CHECK-NEXT: vmlavt.s16 r0, q0, q1
588 %c = icmp eq <8 x i8> %b, zeroinitializer
589 %xx = sext <8 x i8> %x to <8 x i32>
590 %yy = sext <8 x i16> %y to <8 x i32>
591 %m = mul <8 x i32> %xx, %yy
592 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
597 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
598 ; CHECK-LABEL: add_v4i8_v4i32_zext:
599 ; CHECK: @ %bb.0: @ %entry
600 ; CHECK-NEXT: vmov.i32 q3, #0xff
601 ; CHECK-NEXT: vand q1, q1, q3
602 ; CHECK-NEXT: vand q0, q0, q3
603 ; CHECK-NEXT: vand q2, q2, q3
604 ; CHECK-NEXT: vpt.i32 eq, q2, zr
605 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
608 %c = icmp eq <4 x i8> %b, zeroinitializer
609 %xx = zext <4 x i8> %x to <4 x i32>
610 %yy = zext <4 x i8> %y to <4 x i32>
611 %m = mul <4 x i32> %xx, %yy
612 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
617 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
618 ; CHECK-LABEL: add_v4i8_v4i32_sext:
619 ; CHECK: @ %bb.0: @ %entry
620 ; CHECK-NEXT: vmovlb.s8 q1, q1
621 ; CHECK-NEXT: vmovlb.s8 q0, q0
622 ; CHECK-NEXT: vmov.i32 q3, #0xff
623 ; CHECK-NEXT: vmovlb.s16 q1, q1
624 ; CHECK-NEXT: vand q2, q2, q3
625 ; CHECK-NEXT: vmovlb.s16 q0, q0
626 ; CHECK-NEXT: vpt.i32 eq, q2, zr
627 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
630 %c = icmp eq <4 x i8> %b, zeroinitializer
631 %xx = sext <4 x i8> %x to <4 x i32>
632 %yy = sext <4 x i8> %y to <4 x i32>
633 %m = mul <4 x i32> %xx, %yy
634 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
639 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_szext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
640 ; CHECK-LABEL: add_v4i8_v4i32_szext:
641 ; CHECK: @ %bb.0: @ %entry
642 ; CHECK-NEXT: vmov.i32 q3, #0xff
643 ; CHECK-NEXT: vmovlb.s8 q0, q0
644 ; CHECK-NEXT: vand q1, q1, q3
645 ; CHECK-NEXT: vand q2, q2, q3
646 ; CHECK-NEXT: vmovlb.s16 q0, q0
647 ; CHECK-NEXT: vpt.i32 eq, q2, zr
648 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
651 %c = icmp eq <4 x i8> %b, zeroinitializer
652 %xx = sext <4 x i8> %x to <4 x i32>
653 %yy = zext <4 x i8> %y to <4 x i32>
654 %m = mul <4 x i32> %xx, %yy
655 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
660 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
661 ; CHECK-LABEL: add_v16i8_v16i16_zext:
662 ; CHECK: @ %bb.0: @ %entry
663 ; CHECK-NEXT: vpt.i8 eq, q2, zr
664 ; CHECK-NEXT: vmlavt.u8 r0, q0, q1
665 ; CHECK-NEXT: uxth r0, r0
668 %c = icmp eq <16 x i8> %b, zeroinitializer
669 %xx = zext <16 x i8> %x to <16 x i16>
670 %yy = zext <16 x i8> %y to <16 x i16>
671 %m = mul <16 x i16> %xx, %yy
672 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
ret i16 %z
}
677 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
678 ; CHECK-LABEL: add_v16i8_v16i16_sext:
679 ; CHECK: @ %bb.0: @ %entry
680 ; CHECK-NEXT: vpt.i8 eq, q2, zr
681 ; CHECK-NEXT: vmlavt.s8 r0, q0, q1
682 ; CHECK-NEXT: sxth r0, r0
685 %c = icmp eq <16 x i8> %b, zeroinitializer
686 %xx = sext <16 x i8> %x to <16 x i16>
687 %yy = sext <16 x i8> %y to <16 x i16>
688 %m = mul <16 x i16> %xx, %yy
689 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
ret i16 %z
}
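; Mixing sign- and zero-extension of the two operands prevents a single vmlav, so
; the next test spills both inputs and reduces the two halves with separate
; predicated vmlavt/vmlavat operations.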
694 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
695 ; CHECK-LABEL: add_v16i8_v16i16_szext:
696 ; CHECK: @ %bb.0: @ %entry
697 ; CHECK-NEXT: .pad #32
698 ; CHECK-NEXT: sub sp, #32
699 ; CHECK-NEXT: add r0, sp, #16
700 ; CHECK-NEXT: mov r1, sp
701 ; CHECK-NEXT: vstrw.32 q1, [r0]
702 ; CHECK-NEXT: vstrw.32 q0, [r1]
703 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
704 ; CHECK-NEXT: vmov.i8 q0, #0x0
705 ; CHECK-NEXT: vmov.i8 q1, #0xff
706 ; CHECK-NEXT: vldrb.s16 q2, [r1, #8]
707 ; CHECK-NEXT: vpsel q0, q1, q0
708 ; CHECK-NEXT: vmov.u8 r2, q0[8]
709 ; CHECK-NEXT: vmov.u8 r3, q0[0]
710 ; CHECK-NEXT: vmov.16 q1[0], r2
711 ; CHECK-NEXT: vmov.u8 r2, q0[9]
712 ; CHECK-NEXT: vmov.16 q1[1], r2
713 ; CHECK-NEXT: vmov.u8 r2, q0[10]
714 ; CHECK-NEXT: vmov.16 q1[2], r2
715 ; CHECK-NEXT: vmov.u8 r2, q0[11]
716 ; CHECK-NEXT: vmov.16 q1[3], r2
717 ; CHECK-NEXT: vmov.u8 r2, q0[12]
718 ; CHECK-NEXT: vmov.16 q1[4], r2
719 ; CHECK-NEXT: vmov.u8 r2, q0[13]
720 ; CHECK-NEXT: vmov.16 q1[5], r2
721 ; CHECK-NEXT: vmov.u8 r2, q0[14]
722 ; CHECK-NEXT: vmov.16 q1[6], r2
723 ; CHECK-NEXT: vmov.u8 r2, q0[15]
724 ; CHECK-NEXT: vmov.16 q1[7], r2
725 ; CHECK-NEXT: vcmp.i16 ne, q1, zr
; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
; CHECK-NEXT: vpst
728 ; CHECK-NEXT: vmlavt.u16 r2, q2, q1
729 ; CHECK-NEXT: vmov.16 q1[0], r3
730 ; CHECK-NEXT: vmov.u8 r3, q0[1]
731 ; CHECK-NEXT: vmov.16 q1[1], r3
732 ; CHECK-NEXT: vmov.u8 r3, q0[2]
733 ; CHECK-NEXT: vmov.16 q1[2], r3
734 ; CHECK-NEXT: vmov.u8 r3, q0[3]
735 ; CHECK-NEXT: vmov.16 q1[3], r3
736 ; CHECK-NEXT: vmov.u8 r3, q0[4]
737 ; CHECK-NEXT: vmov.16 q1[4], r3
738 ; CHECK-NEXT: vmov.u8 r3, q0[5]
739 ; CHECK-NEXT: vmov.16 q1[5], r3
740 ; CHECK-NEXT: vmov.u8 r3, q0[6]
741 ; CHECK-NEXT: vmov.16 q1[6], r3
742 ; CHECK-NEXT: vmov.u8 r3, q0[7]
743 ; CHECK-NEXT: vmov.16 q1[7], r3
744 ; CHECK-NEXT: vldrb.u16 q0, [r0]
745 ; CHECK-NEXT: vcmp.i16 ne, q1, zr
; CHECK-NEXT: vldrb.s16 q1, [r1]
; CHECK-NEXT: vpst
748 ; CHECK-NEXT: vmlavat.u16 r2, q1, q0
749 ; CHECK-NEXT: sxth r0, r2
750 ; CHECK-NEXT: add sp, #32
753 %c = icmp eq <16 x i8> %b, zeroinitializer
754 %xx = sext <16 x i8> %x to <16 x i16>
755 %yy = zext <16 x i8> %y to <16 x i16>
756 %m = mul <16 x i16> %xx, %yy
757 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
ret i16 %z
}
762 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
763 ; CHECK-LABEL: add_v8i8_v8i16_zext:
764 ; CHECK: @ %bb.0: @ %entry
765 ; CHECK-NEXT: vmovlb.u8 q1, q1
766 ; CHECK-NEXT: vmovlb.u8 q0, q0
767 ; CHECK-NEXT: vmovlb.u8 q2, q2
768 ; CHECK-NEXT: vpt.i16 eq, q2, zr
769 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
770 ; CHECK-NEXT: uxth r0, r0
773 %c = icmp eq <8 x i8> %b, zeroinitializer
774 %xx = zext <8 x i8> %x to <8 x i16>
775 %yy = zext <8 x i8> %y to <8 x i16>
776 %m = mul <8 x i16> %xx, %yy
777 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
ret i16 %z
}
782 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
783 ; CHECK-LABEL: add_v8i8_v8i16_sext:
784 ; CHECK: @ %bb.0: @ %entry
785 ; CHECK-NEXT: vmovlb.s8 q1, q1
786 ; CHECK-NEXT: vmovlb.s8 q0, q0
787 ; CHECK-NEXT: vmovlb.u8 q2, q2
788 ; CHECK-NEXT: vpt.i16 eq, q2, zr
789 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
790 ; CHECK-NEXT: sxth r0, r0
793 %c = icmp eq <8 x i8> %b, zeroinitializer
794 %xx = sext <8 x i8> %x to <8 x i16>
795 %yy = sext <8 x i8> %y to <8 x i16>
796 %m = mul <8 x i16> %xx, %yy
797 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
ret i16 %z
}
802 define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
803 ; CHECK-LABEL: add_v16i8_v16i8:
804 ; CHECK: @ %bb.0: @ %entry
805 ; CHECK-NEXT: vpt.i8 eq, q2, zr
806 ; CHECK-NEXT: vmlavt.u8 r0, q0, q1
807 ; CHECK-NEXT: uxtb r0, r0
810 %c = icmp eq <16 x i8> %b, zeroinitializer
811 %m = mul <16 x i8> %x, %y
812 %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
ret i8 %z
}
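; Widening <16 x i8> products all the way to i64 has no reduction instruction, so
; the two tests below expand the mask and accumulate the products two lanes at a time.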
817 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
818 ; CHECK-LABEL: add_v16i8_v16i64_zext:
819 ; CHECK: @ %bb.0: @ %entry
820 ; CHECK-NEXT: .save {r4, lr}
821 ; CHECK-NEXT: push {r4, lr}
822 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
823 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
824 ; CHECK-NEXT: .pad #32
825 ; CHECK-NEXT: sub sp, #32
826 ; CHECK-NEXT: vmov q4, q0
827 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
828 ; CHECK-NEXT: vmov.i8 q2, #0xff
829 ; CHECK-NEXT: vmov.i8 q0, #0x0
830 ; CHECK-NEXT: vpsel q5, q2, q0
831 ; CHECK-NEXT: vmov q3, q2
832 ; CHECK-NEXT: vmov.u8 r0, q5[0]
833 ; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill
834 ; CHECK-NEXT: vmov.16 q2[0], r0
835 ; CHECK-NEXT: vmov.u8 r0, q5[1]
836 ; CHECK-NEXT: vmov.16 q2[1], r0
837 ; CHECK-NEXT: vmov.u8 r0, q5[2]
838 ; CHECK-NEXT: vmov.16 q2[2], r0
839 ; CHECK-NEXT: vmov.u8 r0, q5[3]
840 ; CHECK-NEXT: vmov.16 q2[3], r0
841 ; CHECK-NEXT: vmov.u8 r0, q5[4]
842 ; CHECK-NEXT: vmov.16 q2[4], r0
843 ; CHECK-NEXT: vmov.u8 r0, q5[5]
844 ; CHECK-NEXT: vmov.16 q2[5], r0
845 ; CHECK-NEXT: vmov.u8 r0, q5[6]
846 ; CHECK-NEXT: vmov.16 q2[6], r0
847 ; CHECK-NEXT: vmov.u8 r0, q5[7]
848 ; CHECK-NEXT: vmov.16 q2[7], r0
849 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
850 ; CHECK-NEXT: vcmp.i16 ne, q2, zr
851 ; CHECK-NEXT: vmov.u8 r3, q4[0]
852 ; CHECK-NEXT: vpsel q6, q3, q0
853 ; CHECK-NEXT: vmov.u16 r0, q6[2]
854 ; CHECK-NEXT: vmov.u16 r1, q6[0]
855 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
856 ; CHECK-NEXT: vmov.u16 r0, q6[3]
857 ; CHECK-NEXT: vmov.u16 r1, q6[1]
858 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
859 ; CHECK-NEXT: vcmp.i32 ne, q2, zr
860 ; CHECK-NEXT: vmov.i64 q2, #0xff
861 ; CHECK-NEXT: vmrs r0, p0
862 ; CHECK-NEXT: and r2, r0, #1
863 ; CHECK-NEXT: ubfx r1, r0, #4, #1
864 ; CHECK-NEXT: rsbs r2, r2, #0
865 ; CHECK-NEXT: rsbs r1, r1, #0
866 ; CHECK-NEXT: vmov q7[2], q7[0], r2, r1
867 ; CHECK-NEXT: vmov q7[3], q7[1], r2, r1
868 ; CHECK-NEXT: vmov.u8 r1, q1[1]
869 ; CHECK-NEXT: vmov.u8 r2, q1[0]
870 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
871 ; CHECK-NEXT: vmov.u8 r2, q4[1]
872 ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
873 ; CHECK-NEXT: vand q0, q0, q2
874 ; CHECK-NEXT: vand q3, q3, q2
875 ; CHECK-NEXT: vmov r1, s2
876 ; CHECK-NEXT: vmov r2, s14
877 ; CHECK-NEXT: vmov r3, s0
878 ; CHECK-NEXT: umull r1, r12, r2, r1
879 ; CHECK-NEXT: vmov r2, s12
880 ; CHECK-NEXT: umull r2, r3, r2, r3
881 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
882 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
883 ; CHECK-NEXT: vand q0, q0, q7
884 ; CHECK-NEXT: vmov r1, r12, d1
885 ; CHECK-NEXT: vmov r3, r2, d0
886 ; CHECK-NEXT: adds.w lr, r3, r1
887 ; CHECK-NEXT: ubfx r3, r0, #12, #1
888 ; CHECK-NEXT: ubfx r0, r0, #8, #1
889 ; CHECK-NEXT: rsb.w r3, r3, #0
890 ; CHECK-NEXT: rsb.w r0, r0, #0
891 ; CHECK-NEXT: vmov.u8 r1, q4[2]
892 ; CHECK-NEXT: vmov q7[2], q7[0], r0, r3
893 ; CHECK-NEXT: adc.w r12, r12, r2
894 ; CHECK-NEXT: vmov q7[3], q7[1], r0, r3
895 ; CHECK-NEXT: vmov.u8 r0, q1[3]
896 ; CHECK-NEXT: vmov.u8 r3, q1[2]
897 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
898 ; CHECK-NEXT: vmov.u8 r3, q4[3]
899 ; CHECK-NEXT: vmov q3[2], q3[0], r1, r3
900 ; CHECK-NEXT: vand q0, q0, q2
901 ; CHECK-NEXT: vand q3, q3, q2
902 ; CHECK-NEXT: vmov r0, s2
903 ; CHECK-NEXT: vmov r1, s14
904 ; CHECK-NEXT: vmov r3, s0
905 ; CHECK-NEXT: vmov r2, s12
906 ; CHECK-NEXT: umull r0, r1, r1, r0
907 ; CHECK-NEXT: umull r2, r3, r2, r3
908 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
909 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
910 ; CHECK-NEXT: vand q0, q0, q7
911 ; CHECK-NEXT: vmov q7, q4
912 ; CHECK-NEXT: vmov r0, r1, d0
913 ; CHECK-NEXT: vmov r2, r3, d1
914 ; CHECK-NEXT: adds.w r0, r0, lr
915 ; CHECK-NEXT: adc.w r1, r1, r12
916 ; CHECK-NEXT: adds.w r12, r0, r2
917 ; CHECK-NEXT: adc.w lr, r1, r3
918 ; CHECK-NEXT: vmov.u16 r2, q6[6]
919 ; CHECK-NEXT: vmov.u16 r3, q6[4]
920 ; CHECK-NEXT: vmov.u8 r1, q4[4]
921 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
922 ; CHECK-NEXT: vmov.u16 r2, q6[7]
923 ; CHECK-NEXT: vmov.u16 r3, q6[5]
924 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
925 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
926 ; CHECK-NEXT: vmrs r2, p0
927 ; CHECK-NEXT: and r0, r2, #1
928 ; CHECK-NEXT: ubfx r3, r2, #4, #1
929 ; CHECK-NEXT: rsbs r0, r0, #0
930 ; CHECK-NEXT: rsbs r3, r3, #0
931 ; CHECK-NEXT: vmov q6[2], q6[0], r0, r3
932 ; CHECK-NEXT: vmov q6[3], q6[1], r0, r3
933 ; CHECK-NEXT: vmov.u8 r0, q1[5]
934 ; CHECK-NEXT: vmov.u8 r3, q1[4]
935 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
936 ; CHECK-NEXT: vmov.u8 r3, q4[5]
937 ; CHECK-NEXT: vmov q3[2], q3[0], r1, r3
938 ; CHECK-NEXT: vand q0, q0, q2
939 ; CHECK-NEXT: vand q3, q3, q2
940 ; CHECK-NEXT: vmov r0, s2
941 ; CHECK-NEXT: vmov r1, s14
942 ; CHECK-NEXT: vmov r3, s0
943 ; CHECK-NEXT: vmov r4, s12
944 ; CHECK-NEXT: umull r0, r1, r1, r0
945 ; CHECK-NEXT: umull r3, r4, r4, r3
946 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
947 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r1
948 ; CHECK-NEXT: vand q0, q0, q6
949 ; CHECK-NEXT: vmov r0, r1, d0
950 ; CHECK-NEXT: vmov r3, r4, d1
951 ; CHECK-NEXT: adds.w r0, r0, r12
952 ; CHECK-NEXT: adc.w r1, r1, lr
953 ; CHECK-NEXT: adds.w r12, r0, r3
954 ; CHECK-NEXT: ubfx r3, r2, #12, #1
955 ; CHECK-NEXT: ubfx r2, r2, #8, #1
956 ; CHECK-NEXT: rsb.w r3, r3, #0
957 ; CHECK-NEXT: rsb.w r2, r2, #0
958 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r3
959 ; CHECK-NEXT: adcs r1, r4
960 ; CHECK-NEXT: vmov q6[3], q6[1], r2, r3
961 ; CHECK-NEXT: vmov.u8 r2, q1[7]
962 ; CHECK-NEXT: vmov.u8 r3, q1[6]
963 ; CHECK-NEXT: vmov.u8 r4, q4[6]
964 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
965 ; CHECK-NEXT: vmov.u8 r3, q4[7]
966 ; CHECK-NEXT: vmov q3[2], q3[0], r4, r3
967 ; CHECK-NEXT: vand q0, q0, q2
968 ; CHECK-NEXT: vand q3, q3, q2
969 ; CHECK-NEXT: vmov r2, s2
970 ; CHECK-NEXT: vmov r3, s14
971 ; CHECK-NEXT: vmov r0, s12
972 ; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
973 ; CHECK-NEXT: vmov r4, s0
974 ; CHECK-NEXT: umull r2, r3, r3, r2
975 ; CHECK-NEXT: umull r0, r4, r0, r4
976 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
977 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
978 ; CHECK-NEXT: vand q0, q0, q6
979 ; CHECK-NEXT: vmov r0, r2, d0
980 ; CHECK-NEXT: adds.w r0, r0, r12
981 ; CHECK-NEXT: adcs r1, r2
982 ; CHECK-NEXT: vmov r2, r3, d1
983 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
984 ; CHECK-NEXT: adds.w r12, r0, r2
985 ; CHECK-NEXT: vmov.u8 r2, q5[8]
986 ; CHECK-NEXT: vmov.16 q6[0], r2
987 ; CHECK-NEXT: vmov.u8 r2, q5[9]
988 ; CHECK-NEXT: vmov.16 q6[1], r2
989 ; CHECK-NEXT: vmov.u8 r2, q5[10]
990 ; CHECK-NEXT: vmov.16 q6[2], r2
991 ; CHECK-NEXT: vmov.u8 r2, q5[11]
992 ; CHECK-NEXT: vmov.16 q6[3], r2
993 ; CHECK-NEXT: vmov.u8 r2, q5[12]
994 ; CHECK-NEXT: vmov.16 q6[4], r2
995 ; CHECK-NEXT: vmov.u8 r2, q5[13]
996 ; CHECK-NEXT: vmov.16 q6[5], r2
997 ; CHECK-NEXT: vmov.u8 r2, q5[14]
998 ; CHECK-NEXT: vmov.16 q6[6], r2
999 ; CHECK-NEXT: vmov.u8 r2, q5[15]
1000 ; CHECK-NEXT: vmov.16 q6[7], r2
1001 ; CHECK-NEXT: adc.w lr, r1, r3
1002 ; CHECK-NEXT: vcmp.i16 ne, q6, zr
1003 ; CHECK-NEXT: vmov.u8 r0, q7[8]
1004 ; CHECK-NEXT: vpsel q3, q3, q0
1005 ; CHECK-NEXT: vmov.u16 r2, q3[2]
1006 ; CHECK-NEXT: vmov.u16 r3, q3[0]
1007 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1008 ; CHECK-NEXT: vmov.u16 r2, q3[3]
1009 ; CHECK-NEXT: vmov.u16 r3, q3[1]
1010 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1011 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1012 ; CHECK-NEXT: vmrs r2, p0
1013 ; CHECK-NEXT: and r4, r2, #1
1014 ; CHECK-NEXT: ubfx r3, r2, #4, #1
1015 ; CHECK-NEXT: rsbs r4, r4, #0
1016 ; CHECK-NEXT: rsbs r3, r3, #0
1017 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r3
1018 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r3
1019 ; CHECK-NEXT: vmov.u8 r3, q1[9]
1020 ; CHECK-NEXT: vmov.u8 r4, q1[8]
1021 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
1022 ; CHECK-NEXT: vmov.u8 r4, q7[9]
1023 ; CHECK-NEXT: vmov q5[2], q5[0], r0, r4
1024 ; CHECK-NEXT: vand q0, q0, q2
1025 ; CHECK-NEXT: vand q5, q5, q2
1026 ; CHECK-NEXT: vmov r3, s2
1027 ; CHECK-NEXT: vmov r0, s22
1028 ; CHECK-NEXT: vmov r4, s0
1029 ; CHECK-NEXT: vmov r1, s20
1030 ; CHECK-NEXT: umull r0, r3, r0, r3
1031 ; CHECK-NEXT: umull r1, r4, r1, r4
1032 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
1033 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
1034 ; CHECK-NEXT: vand q0, q0, q4
1035 ; CHECK-NEXT: vmov r0, r1, d0
1036 ; CHECK-NEXT: vmov r3, r4, d1
1037 ; CHECK-NEXT: adds.w r0, r0, r12
1038 ; CHECK-NEXT: adc.w r1, r1, lr
1039 ; CHECK-NEXT: adds.w r12, r0, r3
1040 ; CHECK-NEXT: ubfx r3, r2, #12, #1
1041 ; CHECK-NEXT: ubfx r2, r2, #8, #1
1042 ; CHECK-NEXT: rsb.w r3, r3, #0
1043 ; CHECK-NEXT: rsb.w r2, r2, #0
1044 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
1045 ; CHECK-NEXT: adcs r1, r4
1046 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3
1047 ; CHECK-NEXT: vmov.u8 r2, q1[11]
1048 ; CHECK-NEXT: vmov.u8 r3, q1[10]
1049 ; CHECK-NEXT: vmov.u8 r4, q7[10]
1050 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1051 ; CHECK-NEXT: vmov.u8 r3, q7[11]
1052 ; CHECK-NEXT: vmov q5[2], q5[0], r4, r3
1053 ; CHECK-NEXT: vand q0, q0, q2
1054 ; CHECK-NEXT: vand q5, q5, q2
1055 ; CHECK-NEXT: vmov r2, s2
1056 ; CHECK-NEXT: vmov r3, s22
1057 ; CHECK-NEXT: vmov r4, s0
1058 ; CHECK-NEXT: vmov r0, s20
1059 ; CHECK-NEXT: umull r2, r3, r3, r2
1060 ; CHECK-NEXT: umull r0, r4, r0, r4
1061 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1062 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
1063 ; CHECK-NEXT: vand q0, q0, q4
1064 ; CHECK-NEXT: vmov r0, r2, d0
1065 ; CHECK-NEXT: adds.w r0, r0, r12
1066 ; CHECK-NEXT: adcs r1, r2
1067 ; CHECK-NEXT: vmov r2, r3, d1
1068 ; CHECK-NEXT: adds.w r12, r0, r2
1069 ; CHECK-NEXT: vmov.u16 r2, q3[6]
1070 ; CHECK-NEXT: adc.w lr, r1, r3
1071 ; CHECK-NEXT: vmov.u16 r3, q3[4]
1072 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1073 ; CHECK-NEXT: vmov.u16 r2, q3[7]
1074 ; CHECK-NEXT: vmov.u16 r3, q3[5]
1075 ; CHECK-NEXT: vmov.u8 r0, q7[12]
1076 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1077 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
1078 ; CHECK-NEXT: vmrs r2, p0
1079 ; CHECK-NEXT: and r4, r2, #1
1080 ; CHECK-NEXT: ubfx r3, r2, #4, #1
1081 ; CHECK-NEXT: rsbs r4, r4, #0
1082 ; CHECK-NEXT: rsbs r3, r3, #0
1083 ; CHECK-NEXT: vmov q3[2], q3[0], r4, r3
1084 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r3
1085 ; CHECK-NEXT: vmov.u8 r3, q1[13]
1086 ; CHECK-NEXT: vmov.u8 r4, q1[12]
1087 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
1088 ; CHECK-NEXT: vmov.u8 r4, q7[13]
1089 ; CHECK-NEXT: vmov q4[2], q4[0], r0, r4
1090 ; CHECK-NEXT: vand q0, q0, q2
1091 ; CHECK-NEXT: vand q4, q4, q2
1092 ; CHECK-NEXT: vmov r3, s2
1093 ; CHECK-NEXT: vmov r0, s18
1094 ; CHECK-NEXT: vmov r4, s0
1095 ; CHECK-NEXT: vmov r1, s16
1096 ; CHECK-NEXT: umull r0, r3, r0, r3
1097 ; CHECK-NEXT: umull r1, r4, r1, r4
1098 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
1099 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
1100 ; CHECK-NEXT: vand q0, q0, q3
1101 ; CHECK-NEXT: vmov r0, r1, d0
1102 ; CHECK-NEXT: vmov r3, r4, d1
1103 ; CHECK-NEXT: adds.w r0, r0, r12
1104 ; CHECK-NEXT: adc.w r1, r1, lr
1105 ; CHECK-NEXT: adds.w r12, r0, r3
1106 ; CHECK-NEXT: ubfx r3, r2, #12, #1
1107 ; CHECK-NEXT: ubfx r2, r2, #8, #1
1108 ; CHECK-NEXT: rsb.w r3, r3, #0
1109 ; CHECK-NEXT: rsb.w r2, r2, #0
1110 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
1111 ; CHECK-NEXT: adcs r1, r4
1112 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3
1113 ; CHECK-NEXT: vmov.u8 r2, q1[15]
1114 ; CHECK-NEXT: vmov.u8 r3, q1[14]
1115 ; CHECK-NEXT: vmov.u8 r4, q7[14]
1116 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1117 ; CHECK-NEXT: vmov.u8 r3, q7[15]
1118 ; CHECK-NEXT: vmov q1[2], q1[0], r4, r3
1119 ; CHECK-NEXT: vand q0, q0, q2
1120 ; CHECK-NEXT: vand q1, q1, q2
1121 ; CHECK-NEXT: vmov r2, s2
1122 ; CHECK-NEXT: vmov r3, s6
1123 ; CHECK-NEXT: vmov r4, s0
1124 ; CHECK-NEXT: vmov r0, s4
1125 ; CHECK-NEXT: umull r2, r3, r3, r2
1126 ; CHECK-NEXT: umull r0, r4, r0, r4
1127 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1128 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
1129 ; CHECK-NEXT: vand q0, q0, q3
1130 ; CHECK-NEXT: vmov r0, r2, d0
1131 ; CHECK-NEXT: adds.w r0, r0, r12
1132 ; CHECK-NEXT: adcs r1, r2
1133 ; CHECK-NEXT: vmov r2, r3, d1
1134 ; CHECK-NEXT: adds r0, r0, r2
1135 ; CHECK-NEXT: adcs r1, r3
1136 ; CHECK-NEXT: add sp, #32
1137 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, pc}
entry:
1140 %c = icmp eq <16 x i8> %b, zeroinitializer
1141 %xx = zext <16 x i8> %x to <16 x i64>
1142 %yy = zext <16 x i8> %y to <16 x i64>
1143 %m = mul <16 x i64> %xx, %yy
1144 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
ret i64 %z
}
1149 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
1150 ; CHECK-LABEL: add_v16i8_v16i64_sext:
1151 ; CHECK: @ %bb.0: @ %entry
1152 ; CHECK-NEXT: .save {r4, lr}
1153 ; CHECK-NEXT: push {r4, lr}
1154 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1155 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1156 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
1157 ; CHECK-NEXT: vmov.i8 q2, #0x0
1158 ; CHECK-NEXT: vmov.i8 q3, #0xff
1159 ; CHECK-NEXT: vmov.s8 r3, q1[0]
1160 ; CHECK-NEXT: vpsel q4, q3, q2
1161 ; CHECK-NEXT: vmov.s8 r4, q0[4]
1162 ; CHECK-NEXT: vmov.u8 r0, q4[0]
1163 ; CHECK-NEXT: vmov.16 q5[0], r0
1164 ; CHECK-NEXT: vmov.u8 r0, q4[1]
1165 ; CHECK-NEXT: vmov.16 q5[1], r0
1166 ; CHECK-NEXT: vmov.u8 r0, q4[2]
1167 ; CHECK-NEXT: vmov.16 q5[2], r0
1168 ; CHECK-NEXT: vmov.u8 r0, q4[3]
1169 ; CHECK-NEXT: vmov.16 q5[3], r0
1170 ; CHECK-NEXT: vmov.u8 r0, q4[4]
1171 ; CHECK-NEXT: vmov.16 q5[4], r0
1172 ; CHECK-NEXT: vmov.u8 r0, q4[5]
1173 ; CHECK-NEXT: vmov.16 q5[5], r0
1174 ; CHECK-NEXT: vmov.u8 r0, q4[6]
1175 ; CHECK-NEXT: vmov.16 q5[6], r0
1176 ; CHECK-NEXT: vmov.u8 r0, q4[7]
1177 ; CHECK-NEXT: vmov.16 q5[7], r0
1178 ; CHECK-NEXT: vcmp.i16 ne, q5, zr
1179 ; CHECK-NEXT: vpsel q5, q3, q2
1180 ; CHECK-NEXT: vmov.u16 r0, q5[2]
1181 ; CHECK-NEXT: vmov.u16 r1, q5[0]
1182 ; CHECK-NEXT: vmov q6[2], q6[0], r1, r0
1183 ; CHECK-NEXT: vmov.u16 r0, q5[3]
1184 ; CHECK-NEXT: vmov.u16 r1, q5[1]
1185 ; CHECK-NEXT: vmov q6[3], q6[1], r1, r0
1186 ; CHECK-NEXT: vcmp.i32 ne, q6, zr
1187 ; CHECK-NEXT: vmrs r0, p0
1188 ; CHECK-NEXT: and r2, r0, #1
1189 ; CHECK-NEXT: ubfx r1, r0, #4, #1
1190 ; CHECK-NEXT: rsbs r2, r2, #0
1191 ; CHECK-NEXT: rsbs r1, r1, #0
1192 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r1
1193 ; CHECK-NEXT: vmov q6[3], q6[1], r2, r1
1194 ; CHECK-NEXT: vmov.s8 r1, q1[1]
1195 ; CHECK-NEXT: vmov.s8 r2, q0[1]
1196 ; CHECK-NEXT: smull r1, r12, r2, r1
1197 ; CHECK-NEXT: vmov.s8 r2, q0[0]
1198 ; CHECK-NEXT: smull r2, r3, r2, r3
1199 ; CHECK-NEXT: vmov q7[2], q7[0], r2, r1
1200 ; CHECK-NEXT: vmov q7[3], q7[1], r3, r12
1201 ; CHECK-NEXT: vand q6, q7, q6
1202 ; CHECK-NEXT: vmov r1, r12, d13
1203 ; CHECK-NEXT: vmov r3, r2, d12
1204 ; CHECK-NEXT: adds.w lr, r3, r1
1205 ; CHECK-NEXT: ubfx r3, r0, #12, #1
1206 ; CHECK-NEXT: ubfx r0, r0, #8, #1
1207 ; CHECK-NEXT: rsb.w r3, r3, #0
1208 ; CHECK-NEXT: rsb.w r0, r0, #0
1209 ; CHECK-NEXT: adc.w r12, r12, r2
1210 ; CHECK-NEXT: vmov q6[2], q6[0], r0, r3
1211 ; CHECK-NEXT: vmov.s8 r1, q1[2]
1212 ; CHECK-NEXT: vmov q6[3], q6[1], r0, r3
1213 ; CHECK-NEXT: vmov.s8 r2, q0[2]
1214 ; CHECK-NEXT: vmov.s8 r0, q1[3]
1215 ; CHECK-NEXT: vmov.s8 r3, q0[3]
1216 ; CHECK-NEXT: smull r0, r3, r3, r0
1217 ; CHECK-NEXT: smull r1, r2, r2, r1
1218 ; CHECK-NEXT: vmov q7[2], q7[0], r1, r0
1219 ; CHECK-NEXT: vmov q7[3], q7[1], r2, r3
1220 ; CHECK-NEXT: vand q6, q7, q6
1221 ; CHECK-NEXT: vmov r0, r1, d12
1222 ; CHECK-NEXT: vmov r2, r3, d13
1223 ; CHECK-NEXT: adds.w r0, r0, lr
1224 ; CHECK-NEXT: adc.w r1, r1, r12
1225 ; CHECK-NEXT: adds.w r12, r0, r2
1226 ; CHECK-NEXT: adc.w lr, r1, r3
1227 ; CHECK-NEXT: vmov.u16 r2, q5[6]
1228 ; CHECK-NEXT: vmov.u16 r3, q5[4]
1229 ; CHECK-NEXT: vmov.s8 r1, q1[4]
1230 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
1231 ; CHECK-NEXT: vmov.u16 r2, q5[7]
1232 ; CHECK-NEXT: vmov.u16 r3, q5[5]
1233 ; CHECK-NEXT: smull r1, r4, r4, r1
1234 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
1235 ; CHECK-NEXT: vcmp.i32 ne, q6, zr
1236 ; CHECK-NEXT: vmrs r2, p0
1237 ; CHECK-NEXT: and r0, r2, #1
1238 ; CHECK-NEXT: ubfx r3, r2, #4, #1
1239 ; CHECK-NEXT: rsbs r0, r0, #0
1240 ; CHECK-NEXT: rsbs r3, r3, #0
1241 ; CHECK-NEXT: vmov q5[2], q5[0], r0, r3
1242 ; CHECK-NEXT: vmov q5[3], q5[1], r0, r3
1243 ; CHECK-NEXT: vmov.s8 r0, q1[5]
1244 ; CHECK-NEXT: vmov.s8 r3, q0[5]
1245 ; CHECK-NEXT: smull r0, r3, r3, r0
1246 ; CHECK-NEXT: vmov q6[2], q6[0], r1, r0
1247 ; CHECK-NEXT: vmov q6[3], q6[1], r4, r3
1248 ; CHECK-NEXT: vand q5, q6, q5
1249 ; CHECK-NEXT: vmov r0, r1, d10
1250 ; CHECK-NEXT: vmov r3, r4, d11
1251 ; CHECK-NEXT: adds.w r0, r0, r12
1252 ; CHECK-NEXT: adc.w r1, r1, lr
1253 ; CHECK-NEXT: adds.w r12, r0, r3
1254 ; CHECK-NEXT: ubfx r3, r2, #12, #1
1255 ; CHECK-NEXT: ubfx r2, r2, #8, #1
1256 ; CHECK-NEXT: rsb.w r3, r3, #0
1257 ; CHECK-NEXT: rsb.w r2, r2, #0
1258 ; CHECK-NEXT: vmov q5[2], q5[0], r2, r3
1259 ; CHECK-NEXT: adcs r1, r4
1260 ; CHECK-NEXT: vmov q5[3], q5[1], r2, r3
1261 ; CHECK-NEXT: vmov.s8 r2, q1[7]
1262 ; CHECK-NEXT: vmov.s8 r3, q0[7]
1263 ; CHECK-NEXT: vmov.s8 r4, q1[6]
1264 ; CHECK-NEXT: vmov.s8 r0, q0[6]
1265 ; CHECK-NEXT: smull r2, r3, r3, r2
1266 ; CHECK-NEXT: smull r0, r4, r0, r4
1267 ; CHECK-NEXT: vmov q6[2], q6[0], r0, r2
1268 ; CHECK-NEXT: vmov q6[3], q6[1], r4, r3
1269 ; CHECK-NEXT: vand q5, q6, q5
1270 ; CHECK-NEXT: vmov r0, r2, d10
1271 ; CHECK-NEXT: adds.w r0, r0, r12
1272 ; CHECK-NEXT: adcs r1, r2
1273 ; CHECK-NEXT: vmov r2, r3, d11
1274 ; CHECK-NEXT: adds.w r12, r0, r2
1275 ; CHECK-NEXT: vmov.u8 r2, q4[8]
1276 ; CHECK-NEXT: vmov.16 q5[0], r2
1277 ; CHECK-NEXT: vmov.u8 r2, q4[9]
1278 ; CHECK-NEXT: vmov.16 q5[1], r2
1279 ; CHECK-NEXT: vmov.u8 r2, q4[10]
1280 ; CHECK-NEXT: vmov.16 q5[2], r2
1281 ; CHECK-NEXT: vmov.u8 r2, q4[11]
1282 ; CHECK-NEXT: vmov.16 q5[3], r2
1283 ; CHECK-NEXT: vmov.u8 r2, q4[12]
1284 ; CHECK-NEXT: vmov.16 q5[4], r2
1285 ; CHECK-NEXT: vmov.u8 r2, q4[13]
1286 ; CHECK-NEXT: vmov.16 q5[5], r2
1287 ; CHECK-NEXT: vmov.u8 r2, q4[14]
1288 ; CHECK-NEXT: vmov.16 q5[6], r2
1289 ; CHECK-NEXT: vmov.u8 r2, q4[15]
1290 ; CHECK-NEXT: vmov.16 q5[7], r2
1291 ; CHECK-NEXT: adc.w lr, r1, r3
1292 ; CHECK-NEXT: vcmp.i16 ne, q5, zr
1293 ; CHECK-NEXT: vmov.s8 r0, q1[8]
1294 ; CHECK-NEXT: vpsel q2, q3, q2
1295 ; CHECK-NEXT: vmov.s8 r1, q0[8]
1296 ; CHECK-NEXT: vmov.u16 r2, q2[2]
1297 ; CHECK-NEXT: vmov.u16 r3, q2[0]
1298 ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
1299 ; CHECK-NEXT: vmov.u16 r2, q2[3]
1300 ; CHECK-NEXT: vmov.u16 r3, q2[1]
1301 ; CHECK-NEXT: smull r0, r1, r1, r0
1302 ; CHECK-NEXT: vmov q3[3], q3[1], r3, r2
1303 ; CHECK-NEXT: vcmp.i32 ne, q3, zr
1304 ; CHECK-NEXT: vmrs r2, p0
1305 ; CHECK-NEXT: and r4, r2, #1
1306 ; CHECK-NEXT: ubfx r3, r2, #4, #1
1307 ; CHECK-NEXT: rsbs r4, r4, #0
1308 ; CHECK-NEXT: rsbs r3, r3, #0
1309 ; CHECK-NEXT: vmov q3[2], q3[0], r4, r3
1310 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r3
1311 ; CHECK-NEXT: vmov.s8 r3, q1[9]
1312 ; CHECK-NEXT: vmov.s8 r4, q0[9]
1313 ; CHECK-NEXT: smull r3, r4, r4, r3
1314 ; CHECK-NEXT: vmov q4[2], q4[0], r0, r3
1315 ; CHECK-NEXT: vmov q4[3], q4[1], r1, r4
1316 ; CHECK-NEXT: vand q3, q4, q3
1317 ; CHECK-NEXT: vmov r0, r1, d6
1318 ; CHECK-NEXT: vmov r3, r4, d7
1319 ; CHECK-NEXT: adds.w r0, r0, r12
1320 ; CHECK-NEXT: adc.w r1, r1, lr
1321 ; CHECK-NEXT: adds.w r12, r0, r3
1322 ; CHECK-NEXT: ubfx r3, r2, #12, #1
1323 ; CHECK-NEXT: ubfx r2, r2, #8, #1
1324 ; CHECK-NEXT: rsb.w r3, r3, #0
1325 ; CHECK-NEXT: rsb.w r2, r2, #0
1326 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
1327 ; CHECK-NEXT: adcs r1, r4
1328 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3
1329 ; CHECK-NEXT: vmov.s8 r2, q1[11]
1330 ; CHECK-NEXT: vmov.s8 r3, q0[11]
1331 ; CHECK-NEXT: vmov.s8 r4, q1[10]
1332 ; CHECK-NEXT: vmov.s8 r0, q0[10]
1333 ; CHECK-NEXT: smull r2, r3, r3, r2
1334 ; CHECK-NEXT: smull r0, r4, r0, r4
1335 ; CHECK-NEXT: vmov q4[2], q4[0], r0, r2
1336 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r3
1337 ; CHECK-NEXT: vand q3, q4, q3
1338 ; CHECK-NEXT: vmov r0, r2, d6
1339 ; CHECK-NEXT: adds.w r0, r0, r12
1340 ; CHECK-NEXT: adcs r1, r2
1341 ; CHECK-NEXT: vmov r2, r3, d7
1342 ; CHECK-NEXT: adds.w r12, r0, r2
1343 ; CHECK-NEXT: vmov.u16 r2, q2[6]
1344 ; CHECK-NEXT: adc.w lr, r1, r3
1345 ; CHECK-NEXT: vmov.u16 r3, q2[4]
1346 ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
1347 ; CHECK-NEXT: vmov.u16 r2, q2[7]
1348 ; CHECK-NEXT: vmov.u16 r3, q2[5]
1349 ; CHECK-NEXT: vmov.s8 r0, q1[12]
1350 ; CHECK-NEXT: vmov q3[3], q3[1], r3, r2
1351 ; CHECK-NEXT: vmov.s8 r1, q0[12]
1352 ; CHECK-NEXT: vcmp.i32 ne, q3, zr
1353 ; CHECK-NEXT: smull r0, r1, r1, r0
1354 ; CHECK-NEXT: vmrs r2, p0
1355 ; CHECK-NEXT: and r4, r2, #1
1356 ; CHECK-NEXT: ubfx r3, r2, #4, #1
1357 ; CHECK-NEXT: rsbs r4, r4, #0
1358 ; CHECK-NEXT: rsbs r3, r3, #0
1359 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r3
1360 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r3
1361 ; CHECK-NEXT: vmov.s8 r3, q1[13]
1362 ; CHECK-NEXT: vmov.s8 r4, q0[13]
1363 ; CHECK-NEXT: smull r3, r4, r4, r3
1364 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r3
1365 ; CHECK-NEXT: vmov q3[3], q3[1], r1, r4
1366 ; CHECK-NEXT: vand q2, q3, q2
1367 ; CHECK-NEXT: vmov r0, r1, d4
1368 ; CHECK-NEXT: vmov r3, r4, d5
1369 ; CHECK-NEXT: adds.w r0, r0, r12
1370 ; CHECK-NEXT: adc.w r1, r1, lr
1371 ; CHECK-NEXT: adds.w r12, r0, r3
1372 ; CHECK-NEXT: ubfx r3, r2, #12, #1
1373 ; CHECK-NEXT: ubfx r2, r2, #8, #1
1374 ; CHECK-NEXT: rsb.w r3, r3, #0
1375 ; CHECK-NEXT: rsb.w r2, r2, #0
1376 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
1377 ; CHECK-NEXT: adcs r1, r4
1378 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r3
1379 ; CHECK-NEXT: vmov.s8 r2, q1[15]
1380 ; CHECK-NEXT: vmov.s8 r3, q0[15]
1381 ; CHECK-NEXT: vmov.s8 r4, q1[14]
1382 ; CHECK-NEXT: vmov.s8 r0, q0[14]
1383 ; CHECK-NEXT: smull r2, r3, r3, r2
1384 ; CHECK-NEXT: smull r0, r4, r0, r4
1385 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
1386 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
1387 ; CHECK-NEXT: vand q0, q0, q2
1388 ; CHECK-NEXT: vmov r0, r2, d0
1389 ; CHECK-NEXT: adds.w r0, r0, r12
1390 ; CHECK-NEXT: adcs r1, r2
1391 ; CHECK-NEXT: vmov r2, r3, d1
1392 ; CHECK-NEXT: adds r0, r0, r2
1393 ; CHECK-NEXT: adcs r1, r3
1394 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, pc}
entry:
1397 %c = icmp eq <16 x i8> %b, zeroinitializer
1398 %xx = sext <16 x i8> %x to <16 x i64>
1399 %yy = sext <16 x i8> %y to <16 x i64>
1400 %m = mul <16 x i64> %xx, %yy
1401 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
ret i64 %z
}
1406 define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
1407 ; CHECK-LABEL: add_v8i8_v8i64_zext:
1408 ; CHECK: @ %bb.0: @ %entry
1409 ; CHECK-NEXT: vmovlb.u8 q1, q1
1410 ; CHECK-NEXT: vmovlb.u8 q0, q0
1411 ; CHECK-NEXT: vmovlb.u8 q2, q2
1412 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1413 ; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
1416 %c = icmp eq <8 x i8> %b, zeroinitializer
1417 %xx = zext <8 x i8> %x to <8 x i64>
1418 %yy = zext <8 x i8> %y to <8 x i64>
1419 %m = mul <8 x i64> %xx, %yy
1420 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
1425 define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
1426 ; CHECK-LABEL: add_v8i8_v8i64_sext:
1427 ; CHECK: @ %bb.0: @ %entry
1428 ; CHECK-NEXT: vmovlb.s8 q1, q1
1429 ; CHECK-NEXT: vmovlb.s8 q0, q0
1430 ; CHECK-NEXT: vmovlb.u8 q2, q2
1431 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1432 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
1435 %c = icmp eq <8 x i8> %b, zeroinitializer
1436 %xx = sext <8 x i8> %x to <8 x i64>
1437 %yy = sext <8 x i8> %y to <8 x i64>
1438 %m = mul <8 x i64> %xx, %yy
1439 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
1444 define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
1445 ; CHECK-LABEL: add_v4i8_v4i64_zext:
1446 ; CHECK: @ %bb.0: @ %entry
1447 ; CHECK-NEXT: vmov.i32 q3, #0xff
1448 ; CHECK-NEXT: vand q1, q1, q3
1449 ; CHECK-NEXT: vand q0, q0, q3
1450 ; CHECK-NEXT: vand q2, q2, q3
1451 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1452 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
1455 %c = icmp eq <4 x i8> %b, zeroinitializer
1456 %xx = zext <4 x i8> %x to <4 x i64>
1457 %yy = zext <4 x i8> %y to <4 x i64>
1458 %m = mul <4 x i64> %xx, %yy
1459 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
1464 define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
1465 ; CHECK-LABEL: add_v4i8_v4i64_sext:
1466 ; CHECK: @ %bb.0: @ %entry
1467 ; CHECK-NEXT: vmovlb.s8 q1, q1
1468 ; CHECK-NEXT: vmovlb.s8 q0, q0
1469 ; CHECK-NEXT: vmov.i32 q3, #0xff
1470 ; CHECK-NEXT: vmovlb.s16 q1, q1
1471 ; CHECK-NEXT: vand q2, q2, q3
1472 ; CHECK-NEXT: vmovlb.s16 q0, q0
1473 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1474 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
1477 %c = icmp eq <4 x i8> %b, zeroinitializer
1478 %xx = sext <4 x i8> %x to <4 x i64>
1479 %yy = sext <4 x i8> %y to <4 x i64>
1480 %m = mul <4 x i64> %xx, %yy
1481 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
1486 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_zext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1487 ; CHECK-LABEL: add_v4i8i16_v4i64_zext:
1488 ; CHECK: @ %bb.0: @ %entry
1489 ; CHECK-NEXT: vmov.i32 q3, #0xff
1490 ; CHECK-NEXT: vmovlb.u16 q1, q1
1491 ; CHECK-NEXT: vand q0, q0, q3
1492 ; CHECK-NEXT: vand q2, q2, q3
1493 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1494 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
1497 %c = icmp eq <4 x i8> %b, zeroinitializer
1498 %xx = zext <4 x i8> %x to <4 x i64>
1499 %yy = zext <4 x i16> %y to <4 x i64>
1500 %m = mul <4 x i64> %xx, %yy
1501 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
1506 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1507 ; CHECK-LABEL: add_v4i8i16_v4i64_sext:
1508 ; CHECK: @ %bb.0: @ %entry
1509 ; CHECK-NEXT: vmovlb.s8 q0, q0
1510 ; CHECK-NEXT: vmov.i32 q3, #0xff
1511 ; CHECK-NEXT: vand q2, q2, q3
1512 ; CHECK-NEXT: vmovlb.s16 q1, q1
1513 ; CHECK-NEXT: vmovlb.s16 q0, q0
1514 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1515 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
1518 %c = icmp eq <4 x i8> %b, zeroinitializer
1519 %xx = sext <4 x i8> %x to <4 x i64>
1520 %yy = sext <4 x i16> %y to <4 x i64>
1521 %m = mul <4 x i64> %xx, %yy
1522 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
1527 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_zext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1528 ; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_zext:
1529 ; CHECK: @ %bb.0: @ %entry
1530 ; CHECK-NEXT: vmov.i32 q3, #0xff
1531 ; CHECK-NEXT: vmovlb.u16 q1, q1
1532 ; CHECK-NEXT: vand q0, q0, q3
1533 ; CHECK-NEXT: vand q2, q2, q3
1534 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1535 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
1538 %c = icmp eq <4 x i8> %b, zeroinitializer
1539 %xx = zext <4 x i8> %x to <4 x i32>
1540 %yy = zext <4 x i16> %y to <4 x i32>
1541 %mm = mul <4 x i32> %xx, %yy
1542 %m = zext <4 x i32> %mm to <4 x i64>
1543 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
1548 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1549 ; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext:
1550 ; CHECK: @ %bb.0: @ %entry
1551 ; CHECK-NEXT: vmovlb.s8 q0, q0
1552 ; CHECK-NEXT: vmov.i32 q3, #0xff
1553 ; CHECK-NEXT: vand q2, q2, q3
1554 ; CHECK-NEXT: vmovlb.s16 q1, q1
1555 ; CHECK-NEXT: vmovlb.s16 q0, q0
1556 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1557 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
1560 %c = icmp eq <4 x i8> %b, zeroinitializer
1561 %xx = sext <4 x i8> %x to <4 x i32>
1562 %yy = sext <4 x i16> %y to <4 x i32>
1563 %mm = mul <4 x i32> %xx, %yy
1564 %m = sext <4 x i32> %mm to <4 x i64>
1565 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
1570 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
1571 ; CHECK-LABEL: add_v2i8_v2i64_zext:
1572 ; CHECK: @ %bb.0: @ %entry
1573 ; CHECK-NEXT: vmov.i64 q3, #0xff
1574 ; CHECK-NEXT: vand q1, q1, q3
1575 ; CHECK-NEXT: vand q0, q0, q3
1576 ; CHECK-NEXT: vmov r0, s6
1577 ; CHECK-NEXT: vmov r1, s2
1578 ; CHECK-NEXT: vmov r2, s4
1579 ; CHECK-NEXT: vand q1, q2, q3
1580 ; CHECK-NEXT: vmov r3, s0
1581 ; CHECK-NEXT: umull r0, r1, r1, r0
1582 ; CHECK-NEXT: umull r2, r3, r3, r2
1583 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
1584 ; CHECK-NEXT: vmov r0, s6
1585 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
1586 ; CHECK-NEXT: vmov r1, s4
1587 ; CHECK-NEXT: cmp r0, #0
1588 ; CHECK-NEXT: cset r0, eq
1589 ; CHECK-NEXT: cmp r0, #0
1590 ; CHECK-NEXT: csetm r0, ne
1591 ; CHECK-NEXT: cmp r1, #0
1592 ; CHECK-NEXT: cset r1, eq
1593 ; CHECK-NEXT: cmp r1, #0
1594 ; CHECK-NEXT: csetm r1, ne
1595 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
1596 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
1597 ; CHECK-NEXT: vand q0, q0, q1
1598 ; CHECK-NEXT: vmov r0, r1, d1
1599 ; CHECK-NEXT: vmov r2, r3, d0
1600 ; CHECK-NEXT: adds r0, r0, r2
1601 ; CHECK-NEXT: adcs r1, r3
1604 %c = icmp eq <2 x i8> %b, zeroinitializer
1605 %xx = zext <2 x i8> %x to <2 x i64>
1606 %yy = zext <2 x i8> %y to <2 x i64>
1607 %m = mul <2 x i64> %xx, %yy
1608 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1609 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1610 ret i64 %z
1611 }
1613 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
1614 ; CHECK-LABEL: add_v2i8_v2i64_sext:
1615 ; CHECK: @ %bb.0: @ %entry
1616 ; CHECK-NEXT: vmov.i32 q3, #0xff
1617 ; CHECK-NEXT: vmov r2, s4
1618 ; CHECK-NEXT: vand q2, q2, q3
1619 ; CHECK-NEXT: vmov r3, s0
1620 ; CHECK-NEXT: vmov r0, s10
1621 ; CHECK-NEXT: vmov r1, s8
1622 ; CHECK-NEXT: sxtb r2, r2
1623 ; CHECK-NEXT: cmp r0, #0
1624 ; CHECK-NEXT: sxtb r3, r3
1625 ; CHECK-NEXT: cset r0, eq
1626 ; CHECK-NEXT: smull r2, r3, r3, r2
1627 ; CHECK-NEXT: cmp r0, #0
1628 ; CHECK-NEXT: csetm r0, ne
1629 ; CHECK-NEXT: cmp r1, #0
1630 ; CHECK-NEXT: cset r1, eq
1631 ; CHECK-NEXT: cmp r1, #0
1632 ; CHECK-NEXT: csetm r1, ne
1633 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
1634 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
1635 ; CHECK-NEXT: vmov r0, s6
1636 ; CHECK-NEXT: vmov r1, s2
1637 ; CHECK-NEXT: sxtb r0, r0
1638 ; CHECK-NEXT: sxtb r1, r1
1639 ; CHECK-NEXT: smull r0, r1, r1, r0
1640 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
1641 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
1642 ; CHECK-NEXT: vand q0, q0, q2
1643 ; CHECK-NEXT: vmov r0, r1, d1
1644 ; CHECK-NEXT: vmov r2, r3, d0
1645 ; CHECK-NEXT: adds r0, r0, r2
1646 ; CHECK-NEXT: adcs r1, r3
1647 ; CHECK-NEXT: bx lr
1648 entry:
1649 %c = icmp eq <2 x i8> %b, zeroinitializer
1650 %xx = sext <2 x i8> %x to <2 x i64>
1651 %yy = sext <2 x i8> %y to <2 x i64>
1652 %m = mul <2 x i64> %xx, %yy
1653 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1654 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1655 ret i64 %z
1656 }
1658 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) {
1659 ; CHECK-LABEL: add_v2i64_v2i64:
1660 ; CHECK: @ %bb.0: @ %entry
1661 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
1662 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
1663 ; CHECK-NEXT: vmov r0, r12, d3
1664 ; CHECK-NEXT: vmov r2, lr, d1
1665 ; CHECK-NEXT: vmov r4, r9, d2
1666 ; CHECK-NEXT: vmov r6, r7, d0
1667 ; CHECK-NEXT: umull r1, r8, r2, r0
1668 ; CHECK-NEXT: umull r3, r5, r6, r4
1669 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r1
1670 ; CHECK-NEXT: mla r1, r2, r12, r8
1671 ; CHECK-NEXT: mla r0, lr, r0, r1
1672 ; CHECK-NEXT: mla r1, r6, r9, r5
1673 ; CHECK-NEXT: mla r1, r7, r4, r1
1674 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
1675 ; CHECK-NEXT: vmov r0, r1, d5
1676 ; CHECK-NEXT: orrs r0, r1
1677 ; CHECK-NEXT: vmov r1, r2, d4
1678 ; CHECK-NEXT: cset r0, eq
1679 ; CHECK-NEXT: cmp r0, #0
1680 ; CHECK-NEXT: csetm r0, ne
1681 ; CHECK-NEXT: orrs r1, r2
1682 ; CHECK-NEXT: cset r1, eq
1683 ; CHECK-NEXT: cmp r1, #0
1684 ; CHECK-NEXT: csetm r1, ne
1685 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
1686 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
1687 ; CHECK-NEXT: vand q0, q0, q1
1688 ; CHECK-NEXT: vmov r0, r1, d1
1689 ; CHECK-NEXT: vmov r2, r3, d0
1690 ; CHECK-NEXT: adds r0, r0, r2
1691 ; CHECK-NEXT: adcs r1, r3
1692 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
1693 entry:
1694 %c = icmp eq <2 x i64> %b, zeroinitializer
1695 %m = mul <2 x i64> %x, %y
1696 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1697 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1698 ret i64 %z
1699 }
1701 define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i32 %a) {
1702 ; CHECK-LABEL: add_v4i32_v4i32_acc:
1703 ; CHECK: @ %bb.0: @ %entry
1704 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1705 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
1706 ; CHECK-NEXT: bx lr
1707 entry:
1708 %c = icmp eq <4 x i32> %b, zeroinitializer
1709 %m = mul <4 x i32> %x, %y
1710 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1711 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1712 %r = add i32 %z, %a
1713 ret i32 %r
1714 }
1716 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
1717 ; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
1718 ; CHECK: @ %bb.0: @ %entry
1719 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1720 ; CHECK-NEXT: vmlalvat.u32 r0, r1, q0, q1
1721 ; CHECK-NEXT: bx lr
1722 entry:
1723 %c = icmp eq <4 x i32> %b, zeroinitializer
1724 %xx = zext <4 x i32> %x to <4 x i64>
1725 %yy = zext <4 x i32> %y to <4 x i64>
1726 %m = mul <4 x i64> %xx, %yy
1727 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1728 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1729 %r = add i64 %z, %a
1730 ret i64 %r
1731 }
1733 define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
1734 ; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
1735 ; CHECK: @ %bb.0: @ %entry
1736 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1737 ; CHECK-NEXT: vmlalvat.s32 r0, r1, q0, q1
1738 ; CHECK-NEXT: bx lr
1739 entry:
1740 %c = icmp eq <4 x i32> %b, zeroinitializer
1741 %xx = sext <4 x i32> %x to <4 x i64>
1742 %yy = sext <4 x i32> %y to <4 x i64>
1743 %m = mul <4 x i64> %xx, %yy
1744 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1745 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1746 %r = add i64 %z, %a
1747 ret i64 %r
1748 }
1750 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
1751 ; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
1752 ; CHECK: @ %bb.0: @ %entry
1753 ; CHECK-NEXT: .save {r7, lr}
1754 ; CHECK-NEXT: push {r7, lr}
1755 ; CHECK-NEXT: vmov r2, s10
1756 ; CHECK-NEXT: vmullb.u32 q3, q0, q1
1757 ; CHECK-NEXT: vmov r3, s8
1758 ; CHECK-NEXT: cmp r2, #0
1759 ; CHECK-NEXT: cset r2, eq
1760 ; CHECK-NEXT: cmp r2, #0
1761 ; CHECK-NEXT: csetm r2, ne
1762 ; CHECK-NEXT: cmp r3, #0
1763 ; CHECK-NEXT: cset r3, eq
1764 ; CHECK-NEXT: cmp r3, #0
1765 ; CHECK-NEXT: csetm r3, ne
1766 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1767 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1768 ; CHECK-NEXT: vand q0, q3, q0
1769 ; CHECK-NEXT: vmov lr, r12, d1
1770 ; CHECK-NEXT: vmov r3, r2, d0
1771 ; CHECK-NEXT: adds.w r3, r3, lr
1772 ; CHECK-NEXT: adc.w r2, r2, r12
1773 ; CHECK-NEXT: adds r0, r0, r3
1774 ; CHECK-NEXT: adcs r1, r2
1775 ; CHECK-NEXT: pop {r7, pc}
1776 entry:
1777 %c = icmp eq <2 x i32> %b, zeroinitializer
1778 %xx = zext <2 x i32> %x to <2 x i64>
1779 %yy = zext <2 x i32> %y to <2 x i64>
1780 %m = mul <2 x i64> %xx, %yy
1781 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1782 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1783 %r = add i64 %z, %a
1784 ret i64 %r
1785 }
1787 define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
1788 ; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
1789 ; CHECK: @ %bb.0: @ %entry
1790 ; CHECK-NEXT: .save {r7, lr}
1791 ; CHECK-NEXT: push {r7, lr}
1792 ; CHECK-NEXT: vmov r2, s10
1793 ; CHECK-NEXT: vmullb.s32 q3, q0, q1
1794 ; CHECK-NEXT: vmov r3, s8
1795 ; CHECK-NEXT: cmp r2, #0
1796 ; CHECK-NEXT: cset r2, eq
1797 ; CHECK-NEXT: cmp r2, #0
1798 ; CHECK-NEXT: csetm r2, ne
1799 ; CHECK-NEXT: cmp r3, #0
1800 ; CHECK-NEXT: cset r3, eq
1801 ; CHECK-NEXT: cmp r3, #0
1802 ; CHECK-NEXT: csetm r3, ne
1803 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
1804 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
1805 ; CHECK-NEXT: vand q0, q3, q0
1806 ; CHECK-NEXT: vmov lr, r12, d1
1807 ; CHECK-NEXT: vmov r3, r2, d0
1808 ; CHECK-NEXT: adds.w r3, r3, lr
1809 ; CHECK-NEXT: adc.w r2, r2, r12
1810 ; CHECK-NEXT: adds r0, r0, r3
1811 ; CHECK-NEXT: adcs r1, r2
1812 ; CHECK-NEXT: pop {r7, pc}
1813 entry:
1814 %c = icmp eq <2 x i32> %b, zeroinitializer
1815 %xx = sext <2 x i32> %x to <2 x i64>
1816 %yy = sext <2 x i32> %y to <2 x i64>
1817 %m = mul <2 x i64> %xx, %yy
1818 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1819 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1820 %r = add i64 %z, %a
1821 ret i64 %r
1822 }
1824 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
1825 ; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
1826 ; CHECK: @ %bb.0: @ %entry
1827 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1828 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
1829 ; CHECK-NEXT: bx lr
1830 entry:
1831 %c = icmp eq <8 x i16> %b, zeroinitializer
1832 %xx = zext <8 x i16> %x to <8 x i32>
1833 %yy = zext <8 x i16> %y to <8 x i32>
1834 %m = mul <8 x i32> %xx, %yy
1835 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
1836 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
1837 %r = add i32 %z, %a
1838 ret i32 %r
1839 }
1841 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
1842 ; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
1843 ; CHECK: @ %bb.0: @ %entry
1844 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1845 ; CHECK-NEXT: vmlavat.s16 r0, q0, q1
1846 ; CHECK-NEXT: bx lr
1847 entry:
1848 %c = icmp eq <8 x i16> %b, zeroinitializer
1849 %xx = sext <8 x i16> %x to <8 x i32>
1850 %yy = sext <8 x i16> %y to <8 x i32>
1851 %m = mul <8 x i32> %xx, %yy
1852 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
1853 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
1854 %r = add i32 %z, %a
1855 ret i32 %r
1856 }
1858 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
1859 ; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
1860 ; CHECK: @ %bb.0: @ %entry
1861 ; CHECK-NEXT: vmovlb.u16 q1, q1
1862 ; CHECK-NEXT: vmovlb.u16 q0, q0
1863 ; CHECK-NEXT: vmovlb.u16 q2, q2
1864 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1865 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
1866 ; CHECK-NEXT: bx lr
1867 entry:
1868 %c = icmp eq <4 x i16> %b, zeroinitializer
1869 %xx = zext <4 x i16> %x to <4 x i32>
1870 %yy = zext <4 x i16> %y to <4 x i32>
1871 %m = mul <4 x i32> %xx, %yy
1872 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1873 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1874 %r = add i32 %z, %a
1875 ret i32 %r
1876 }
1878 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
1879 ; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
1880 ; CHECK: @ %bb.0: @ %entry
1881 ; CHECK-NEXT: vmovlb.s16 q1, q1
1882 ; CHECK-NEXT: vmovlb.s16 q0, q0
1883 ; CHECK-NEXT: vmovlb.u16 q2, q2
1884 ; CHECK-NEXT: vpt.i32 eq, q2, zr
1885 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
1886 ; CHECK-NEXT: bx lr
1887 entry:
1888 %c = icmp eq <4 x i16> %b, zeroinitializer
1889 %xx = sext <4 x i16> %x to <4 x i32>
1890 %yy = sext <4 x i16> %y to <4 x i32>
1891 %m = mul <4 x i32> %xx, %yy
1892 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1893 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1894 %r = add i32 %z, %a
1895 ret i32 %r
1896 }
1898 define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i16 %a) {
1899 ; CHECK-LABEL: add_v8i16_v8i16_acc:
1900 ; CHECK: @ %bb.0: @ %entry
1901 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1902 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
1903 ; CHECK-NEXT: uxth r0, r0
1904 ; CHECK-NEXT: bx lr
1905 entry:
1906 %c = icmp eq <8 x i16> %b, zeroinitializer
1907 %m = mul <8 x i16> %x, %y
1908 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
1909 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
1910 %r = add i16 %z, %a
1911 ret i16 %r
1912 }
1914 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1915 ; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
1916 ; CHECK: @ %bb.0: @ %entry
1917 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1918 ; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1
1919 ; CHECK-NEXT: bx lr
1920 entry:
1921 %c = icmp eq <8 x i16> %b, zeroinitializer
1922 %xx = zext <8 x i16> %x to <8 x i64>
1923 %yy = zext <8 x i16> %y to <8 x i64>
1924 %m = mul <8 x i64> %xx, %yy
1925 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1926 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1927 %r = add i64 %z, %a
1928 ret i64 %r
1929 }
1931 define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1932 ; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
1933 ; CHECK: @ %bb.0: @ %entry
1934 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1935 ; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1
1936 ; CHECK-NEXT: bx lr
1937 entry:
1938 %c = icmp eq <8 x i16> %b, zeroinitializer
1939 %xx = sext <8 x i16> %x to <8 x i64>
1940 %yy = sext <8 x i16> %y to <8 x i64>
1941 %m = mul <8 x i64> %xx, %yy
1942 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1943 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1944 %r = add i64 %z, %a
1945 ret i64 %r
1946 }
1948 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1949 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext:
1950 ; CHECK: @ %bb.0: @ %entry
1951 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1952 ; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1
1953 ; CHECK-NEXT: bx lr
1954 entry:
1955 %c = icmp eq <8 x i16> %b, zeroinitializer
1956 %xx = zext <8 x i16> %x to <8 x i32>
1957 %yy = zext <8 x i16> %y to <8 x i32>
1958 %m = mul <8 x i32> %xx, %yy
1959 %ma = zext <8 x i32> %m to <8 x i64>
1960 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1961 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1962 %r = add i64 %z, %a
1963 ret i64 %r
1964 }
1966 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1967 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext:
1968 ; CHECK: @ %bb.0: @ %entry
1969 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1970 ; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1
1971 ; CHECK-NEXT: bx lr
1972 entry:
1973 %c = icmp eq <8 x i16> %b, zeroinitializer
1974 %xx = sext <8 x i16> %x to <8 x i32>
1975 %yy = sext <8 x i16> %y to <8 x i32>
1976 %m = mul <8 x i32> %xx, %yy
1977 %ma = sext <8 x i32> %m to <8 x i64>
1978 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1979 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1980 %r = add i64 %z, %a
1981 ret i64 %r
1982 }
1984 define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1985 ; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext:
1986 ; CHECK: @ %bb.0: @ %entry
1987 ; CHECK-NEXT: vpt.i16 eq, q2, zr
1988 ; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q0
1989 ; CHECK-NEXT: bx lr
1990 entry:
1991 %c = icmp eq <8 x i16> %b, zeroinitializer
1992 %xx = sext <8 x i16> %x to <8 x i32>
1993 %m = mul <8 x i32> %xx, %xx
1994 %ma = zext <8 x i32> %m to <8 x i64>
1995 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1996 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1997 %r = add i64 %z, %a
1998 ret i64 %r
1999 }
2001 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
2002 ; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
2003 ; CHECK: @ %bb.0: @ %entry
2004 ; CHECK-NEXT: .save {r7, lr}
2005 ; CHECK-NEXT: push {r7, lr}
2006 ; CHECK-NEXT: vmov.i64 q3, #0xffff
2007 ; CHECK-NEXT: vand q1, q1, q3
2008 ; CHECK-NEXT: vand q0, q0, q3
2009 ; CHECK-NEXT: vmov r2, s6
2010 ; CHECK-NEXT: vmov r3, s2
2011 ; CHECK-NEXT: umull lr, r12, r3, r2
2012 ; CHECK-NEXT: vmov r3, s4
2013 ; CHECK-NEXT: vmov r2, s0
2014 ; CHECK-NEXT: vand q1, q2, q3
2015 ; CHECK-NEXT: umull r2, r3, r2, r3
2016 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
2017 ; CHECK-NEXT: vmov r2, s6
2018 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
2019 ; CHECK-NEXT: vmov r3, s4
2020 ; CHECK-NEXT: cmp r2, #0
2021 ; CHECK-NEXT: cset r2, eq
2022 ; CHECK-NEXT: cmp r2, #0
2023 ; CHECK-NEXT: csetm r2, ne
2024 ; CHECK-NEXT: cmp r3, #0
2025 ; CHECK-NEXT: cset r3, eq
2026 ; CHECK-NEXT: cmp r3, #0
2027 ; CHECK-NEXT: csetm r3, ne
2028 ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
2029 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
2030 ; CHECK-NEXT: vand q0, q0, q1
2031 ; CHECK-NEXT: vmov lr, r12, d1
2032 ; CHECK-NEXT: vmov r3, r2, d0
2033 ; CHECK-NEXT: adds.w r3, r3, lr
2034 ; CHECK-NEXT: adc.w r2, r2, r12
2035 ; CHECK-NEXT: adds r0, r0, r3
2036 ; CHECK-NEXT: adcs r1, r2
2037 ; CHECK-NEXT: pop {r7, pc}
2038 entry:
2039 %c = icmp eq <2 x i16> %b, zeroinitializer
2040 %xx = zext <2 x i16> %x to <2 x i64>
2041 %yy = zext <2 x i16> %y to <2 x i64>
2042 %m = mul <2 x i64> %xx, %yy
2043 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2044 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
2045 %r = add i64 %z, %a
2046 ret i64 %r
2047 }
2049 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
2050 ; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
2051 ; CHECK: @ %bb.0: @ %entry
2052 ; CHECK-NEXT: .save {r7, lr}
2053 ; CHECK-NEXT: push {r7, lr}
2054 ; CHECK-NEXT: vmov.i32 q3, #0xffff
2055 ; CHECK-NEXT: vand q2, q2, q3
2056 ; CHECK-NEXT: vmov r2, s10
2057 ; CHECK-NEXT: vmov r3, s8
2058 ; CHECK-NEXT: cmp r2, #0
2059 ; CHECK-NEXT: cset r2, eq
2060 ; CHECK-NEXT: cmp r2, #0
2061 ; CHECK-NEXT: csetm r2, ne
2062 ; CHECK-NEXT: cmp r3, #0
2063 ; CHECK-NEXT: cset r3, eq
2064 ; CHECK-NEXT: cmp r3, #0
2065 ; CHECK-NEXT: csetm r3, ne
2066 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
2067 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r2
2068 ; CHECK-NEXT: vmov r2, s6
2069 ; CHECK-NEXT: vmov r3, s2
2070 ; CHECK-NEXT: sxth r2, r2
2071 ; CHECK-NEXT: sxth r3, r3
2072 ; CHECK-NEXT: smull lr, r12, r3, r2
2073 ; CHECK-NEXT: vmov r3, s4
2074 ; CHECK-NEXT: vmov r2, s0
2075 ; CHECK-NEXT: sxth r3, r3
2076 ; CHECK-NEXT: sxth r2, r2
2077 ; CHECK-NEXT: smull r2, r3, r2, r3
2078 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
2079 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
2080 ; CHECK-NEXT: vand q0, q0, q2
2081 ; CHECK-NEXT: vmov lr, r12, d1
2082 ; CHECK-NEXT: vmov r3, r2, d0
2083 ; CHECK-NEXT: adds.w r3, r3, lr
2084 ; CHECK-NEXT: adc.w r2, r2, r12
2085 ; CHECK-NEXT: adds r0, r0, r3
2086 ; CHECK-NEXT: adcs r1, r2
2087 ; CHECK-NEXT: pop {r7, pc}
2088 entry:
2089 %c = icmp eq <2 x i16> %b, zeroinitializer
2090 %xx = sext <2 x i16> %x to <2 x i64>
2091 %yy = sext <2 x i16> %y to <2 x i64>
2092 %m = mul <2 x i64> %xx, %yy
2093 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2094 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
2095 %r = add i64 %z, %a
2096 ret i64 %r
2097 }
2099 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2100 ; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
2101 ; CHECK: @ %bb.0: @ %entry
2102 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2103 ; CHECK-NEXT: vmlavat.u8 r0, q0, q1
2104 ; CHECK-NEXT: bx lr
2105 entry:
2106 %c = icmp eq <16 x i8> %b, zeroinitializer
2107 %xx = zext <16 x i8> %x to <16 x i32>
2108 %yy = zext <16 x i8> %y to <16 x i32>
2109 %m = mul <16 x i32> %xx, %yy
2110 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
2111 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2112 %r = add i32 %z, %a
2113 ret i32 %r
2114 }
2116 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2117 ; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
2118 ; CHECK: @ %bb.0: @ %entry
2119 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2120 ; CHECK-NEXT: vmlavat.s8 r0, q0, q1
2121 ; CHECK-NEXT: bx lr
2122 entry:
2123 %c = icmp eq <16 x i8> %b, zeroinitializer
2124 %xx = sext <16 x i8> %x to <16 x i32>
2125 %yy = sext <16 x i8> %y to <16 x i32>
2126 %m = mul <16 x i32> %xx, %yy
2127 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
2128 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2129 %r = add i32 %z, %a
2130 ret i32 %r
2131 }
2133 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2134 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext:
2135 ; CHECK: @ %bb.0: @ %entry
2136 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2137 ; CHECK-NEXT: vmlavat.u8 r0, q0, q1
2138 ; CHECK-NEXT: bx lr
2139 entry:
2140 %c = icmp eq <16 x i8> %b, zeroinitializer
2141 %xx = zext <16 x i8> %x to <16 x i16>
2142 %yy = zext <16 x i8> %y to <16 x i16>
2143 %m = mul <16 x i16> %xx, %yy
2144 %ma = zext <16 x i16> %m to <16 x i32>
2145 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2146 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2147 %r = add i32 %z, %a
2148 ret i32 %r
2149 }
2151 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2152 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext:
2153 ; CHECK: @ %bb.0: @ %entry
2154 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2155 ; CHECK-NEXT: vmlavat.s8 r0, q0, q1
2156 ; CHECK-NEXT: bx lr
2157 entry:
2158 %c = icmp eq <16 x i8> %b, zeroinitializer
2159 %xx = sext <16 x i8> %x to <16 x i16>
2160 %yy = sext <16 x i8> %y to <16 x i16>
2161 %m = mul <16 x i16> %xx, %yy
2162 %ma = sext <16 x i16> %m to <16 x i32>
2163 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2164 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2165 %r = add i32 %z, %a
2166 ret i32 %r
2167 }
2169 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2170 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext:
2171 ; CHECK: @ %bb.0: @ %entry
2172 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2173 ; CHECK-NEXT: vmlavat.s8 r0, q0, q0
2174 ; CHECK-NEXT: bx lr
2175 entry:
2176 %c = icmp eq <16 x i8> %b, zeroinitializer
2177 %xx = sext <16 x i8> %x to <16 x i16>
2178 %m = mul <16 x i16> %xx, %xx
2179 %ma = zext <16 x i16> %m to <16 x i32>
2180 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2181 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2182 %r = add i32 %z, %a
2183 ret i32 %r
2184 }
2186 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
2187 ; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
2188 ; CHECK: @ %bb.0: @ %entry
2189 ; CHECK-NEXT: vmov.i32 q3, #0xff
2190 ; CHECK-NEXT: vand q1, q1, q3
2191 ; CHECK-NEXT: vand q0, q0, q3
2192 ; CHECK-NEXT: vand q2, q2, q3
2193 ; CHECK-NEXT: vpt.i32 eq, q2, zr
2194 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
2195 ; CHECK-NEXT: bx lr
2196 entry:
2197 %c = icmp eq <4 x i8> %b, zeroinitializer
2198 %xx = zext <4 x i8> %x to <4 x i32>
2199 %yy = zext <4 x i8> %y to <4 x i32>
2200 %m = mul <4 x i32> %xx, %yy
2201 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
2202 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
2203 %r = add i32 %z, %a
2204 ret i32 %r
2205 }
2207 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
2208 ; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
2209 ; CHECK: @ %bb.0: @ %entry
2210 ; CHECK-NEXT: vmovlb.s8 q1, q1
2211 ; CHECK-NEXT: vmovlb.s8 q0, q0
2212 ; CHECK-NEXT: vmov.i32 q3, #0xff
2213 ; CHECK-NEXT: vmovlb.s16 q1, q1
2214 ; CHECK-NEXT: vand q2, q2, q3
2215 ; CHECK-NEXT: vmovlb.s16 q0, q0
2216 ; CHECK-NEXT: vpt.i32 eq, q2, zr
2217 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
2218 ; CHECK-NEXT: bx lr
2219 entry:
2220 %c = icmp eq <4 x i8> %b, zeroinitializer
2221 %xx = sext <4 x i8> %x to <4 x i32>
2222 %yy = sext <4 x i8> %y to <4 x i32>
2223 %m = mul <4 x i32> %xx, %yy
2224 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
2225 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
2226 %r = add i32 %z, %a
2227 ret i32 %r
2228 }
2230 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
2231 ; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
2232 ; CHECK: @ %bb.0: @ %entry
2233 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2234 ; CHECK-NEXT: vmlavat.u8 r0, q0, q1
2235 ; CHECK-NEXT: uxth r0, r0
2236 ; CHECK-NEXT: bx lr
2237 entry:
2238 %c = icmp eq <16 x i8> %b, zeroinitializer
2239 %xx = zext <16 x i8> %x to <16 x i16>
2240 %yy = zext <16 x i8> %y to <16 x i16>
2241 %m = mul <16 x i16> %xx, %yy
2242 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
2243 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
2244 %r = add i16 %z, %a
2245 ret i16 %r
2246 }
2248 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
2249 ; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
2250 ; CHECK: @ %bb.0: @ %entry
2251 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2252 ; CHECK-NEXT: vmlavat.s8 r0, q0, q1
2253 ; CHECK-NEXT: sxth r0, r0
2254 ; CHECK-NEXT: bx lr
2255 entry:
2256 %c = icmp eq <16 x i8> %b, zeroinitializer
2257 %xx = sext <16 x i8> %x to <16 x i16>
2258 %yy = sext <16 x i8> %y to <16 x i16>
2259 %m = mul <16 x i16> %xx, %yy
2260 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
2261 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
2262 %r = add i16 %z, %a
2263 ret i16 %r
2264 }
2266 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
2267 ; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
2268 ; CHECK: @ %bb.0: @ %entry
2269 ; CHECK-NEXT: vmovlb.u8 q1, q1
2270 ; CHECK-NEXT: vmovlb.u8 q0, q0
2271 ; CHECK-NEXT: vmovlb.u8 q2, q2
2272 ; CHECK-NEXT: vpt.i16 eq, q2, zr
2273 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
2274 ; CHECK-NEXT: uxth r0, r0
2275 ; CHECK-NEXT: bx lr
2276 entry:
2277 %c = icmp eq <8 x i8> %b, zeroinitializer
2278 %xx = zext <8 x i8> %x to <8 x i16>
2279 %yy = zext <8 x i8> %y to <8 x i16>
2280 %m = mul <8 x i16> %xx, %yy
2281 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
2282 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
2283 %r = add i16 %z, %a
2284 ret i16 %r
2285 }
2287 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
2288 ; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
2289 ; CHECK: @ %bb.0: @ %entry
2290 ; CHECK-NEXT: vmovlb.s8 q1, q1
2291 ; CHECK-NEXT: vmovlb.s8 q0, q0
2292 ; CHECK-NEXT: vmovlb.u8 q2, q2
2293 ; CHECK-NEXT: vpt.i16 eq, q2, zr
2294 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
2295 ; CHECK-NEXT: sxth r0, r0
2296 ; CHECK-NEXT: bx lr
2297 entry:
2298 %c = icmp eq <8 x i8> %b, zeroinitializer
2299 %xx = sext <8 x i8> %x to <8 x i16>
2300 %yy = sext <8 x i8> %y to <8 x i16>
2301 %m = mul <8 x i16> %xx, %yy
2302 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
2303 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
2304 %r = add i16 %z, %a
2305 ret i16 %r
2306 }
2308 define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i8 %a) {
2309 ; CHECK-LABEL: add_v16i8_v16i8_acc:
2310 ; CHECK: @ %bb.0: @ %entry
2311 ; CHECK-NEXT: vpt.i8 eq, q2, zr
2312 ; CHECK-NEXT: vmlavat.u8 r0, q0, q1
2313 ; CHECK-NEXT: uxtb r0, r0
2314 ; CHECK-NEXT: bx lr
2315 entry:
2316 %c = icmp eq <16 x i8> %b, zeroinitializer
2317 %m = mul <16 x i8> %x, %y
2318 %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
2319 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
2320 %r = add i8 %z, %a
2321 ret i8 %r
2322 }
2324 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
2325 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
2326 ; CHECK: @ %bb.0: @ %entry
2327 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2328 ; CHECK-NEXT: push {r4, r5, r6, lr}
2329 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2330 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2331 ; CHECK-NEXT: .pad #32
2332 ; CHECK-NEXT: sub sp, #32
2333 ; CHECK-NEXT: vmov q4, q0
2334 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
2335 ; CHECK-NEXT: vmov.i8 q2, #0xff
2336 ; CHECK-NEXT: vmov.i8 q0, #0x0
2337 ; CHECK-NEXT: vpsel q5, q2, q0
2338 ; CHECK-NEXT: vmov q3, q2
2339 ; CHECK-NEXT: vmov.u8 r2, q5[0]
2340 ; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill
2341 ; CHECK-NEXT: vmov.16 q2[0], r2
2342 ; CHECK-NEXT: vmov.u8 r2, q5[1]
2343 ; CHECK-NEXT: vmov.16 q2[1], r2
2344 ; CHECK-NEXT: vmov.u8 r2, q5[2]
2345 ; CHECK-NEXT: vmov.16 q2[2], r2
2346 ; CHECK-NEXT: vmov.u8 r2, q5[3]
2347 ; CHECK-NEXT: vmov.16 q2[3], r2
2348 ; CHECK-NEXT: vmov.u8 r2, q5[4]
2349 ; CHECK-NEXT: vmov.16 q2[4], r2
2350 ; CHECK-NEXT: vmov.u8 r2, q5[5]
2351 ; CHECK-NEXT: vmov.16 q2[5], r2
2352 ; CHECK-NEXT: vmov.u8 r2, q5[6]
2353 ; CHECK-NEXT: vmov.16 q2[6], r2
2354 ; CHECK-NEXT: vmov.u8 r2, q5[7]
2355 ; CHECK-NEXT: vmov.16 q2[7], r2
2356 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
2357 ; CHECK-NEXT: vcmp.i16 ne, q2, zr
2358 ; CHECK-NEXT: vpsel q6, q3, q0
2359 ; CHECK-NEXT: vmov.u16 r2, q6[2]
2360 ; CHECK-NEXT: vmov.u16 r3, q6[0]
2361 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
2362 ; CHECK-NEXT: vmov.u16 r2, q6[3]
2363 ; CHECK-NEXT: vmov.u16 r3, q6[1]
2364 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r2
2365 ; CHECK-NEXT: vcmp.i32 ne, q2, zr
2366 ; CHECK-NEXT: vmov.i64 q2, #0xff
2367 ; CHECK-NEXT: vmrs lr, p0
2368 ; CHECK-NEXT: and r2, lr, #1
2369 ; CHECK-NEXT: ubfx r3, lr, #4, #1
2370 ; CHECK-NEXT: rsbs r2, r2, #0
2371 ; CHECK-NEXT: rsbs r3, r3, #0
2372 ; CHECK-NEXT: vmov q7[2], q7[0], r2, r3
2373 ; CHECK-NEXT: vmov q7[3], q7[1], r2, r3
2374 ; CHECK-NEXT: vmov.u8 r2, q1[1]
2375 ; CHECK-NEXT: vmov.u8 r3, q1[0]
2376 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
2377 ; CHECK-NEXT: vmov.u8 r3, q4[1]
2378 ; CHECK-NEXT: vmov.u8 r2, q4[0]
2379 ; CHECK-NEXT: vand q0, q0, q2
2380 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
2381 ; CHECK-NEXT: vmov r12, s2
2382 ; CHECK-NEXT: vand q3, q3, q2
2383 ; CHECK-NEXT: vmov r3, s0
2384 ; CHECK-NEXT: vmov r2, s14
2385 ; CHECK-NEXT: vmov r4, s12
2386 ; CHECK-NEXT: umull r2, r12, r2, r12
2387 ; CHECK-NEXT: umull r3, r4, r4, r3
2388 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
2389 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r12
2390 ; CHECK-NEXT: vand q0, q0, q7
2391 ; CHECK-NEXT: vmov r2, r12, d1
2392 ; CHECK-NEXT: vmov r3, r4, d0
2393 ; CHECK-NEXT: adds r6, r3, r2
2394 ; CHECK-NEXT: ubfx r2, lr, #12, #1
2395 ; CHECK-NEXT: adc.w r12, r12, r4
2396 ; CHECK-NEXT: ubfx r4, lr, #8, #1
2397 ; CHECK-NEXT: rsbs r2, r2, #0
2398 ; CHECK-NEXT: rsbs r4, r4, #0
2399 ; CHECK-NEXT: vmov q7[2], q7[0], r4, r2
2400 ; CHECK-NEXT: vmov.u8 r3, q4[2]
2401 ; CHECK-NEXT: vmov q7[3], q7[1], r4, r2
2402 ; CHECK-NEXT: vmov.u8 r2, q1[3]
2403 ; CHECK-NEXT: vmov.u8 r4, q1[2]
2404 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r2
2405 ; CHECK-NEXT: vmov.u8 r4, q4[3]
2406 ; CHECK-NEXT: vmov q3[2], q3[0], r3, r4
2407 ; CHECK-NEXT: vand q0, q0, q2
2408 ; CHECK-NEXT: vand q3, q3, q2
2409 ; CHECK-NEXT: vmov r2, s2
2410 ; CHECK-NEXT: vmov r3, s14
2411 ; CHECK-NEXT: vmov r4, s0
2412 ; CHECK-NEXT: vmov r5, s12
2413 ; CHECK-NEXT: umull r2, r3, r3, r2
2414 ; CHECK-NEXT: umull r5, r4, r5, r4
2415 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
2416 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
2417 ; CHECK-NEXT: vmov.u8 r4, q4[4]
2418 ; CHECK-NEXT: vand q0, q0, q7
2419 ; CHECK-NEXT: vmov q7, q4
2420 ; CHECK-NEXT: vmov r2, r3, d0
2421 ; CHECK-NEXT: adds r2, r2, r6
2422 ; CHECK-NEXT: vmov r6, r5, d1
2423 ; CHECK-NEXT: adc.w r3, r3, r12
2424 ; CHECK-NEXT: adds.w r12, r2, r6
2425 ; CHECK-NEXT: vmov.u16 r2, q6[6]
2426 ; CHECK-NEXT: vmov.u16 r6, q6[4]
2427 ; CHECK-NEXT: adc.w lr, r3, r5
2428 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
2429 ; CHECK-NEXT: vmov.u16 r2, q6[7]
2430 ; CHECK-NEXT: vmov.u16 r6, q6[5]
2431 ; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
2432 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2433 ; CHECK-NEXT: vmrs r2, p0
2434 ; CHECK-NEXT: and r5, r2, #1
2435 ; CHECK-NEXT: ubfx r6, r2, #4, #1
2436 ; CHECK-NEXT: rsbs r5, r5, #0
2437 ; CHECK-NEXT: rsbs r6, r6, #0
2438 ; CHECK-NEXT: vmov q6[2], q6[0], r5, r6
2439 ; CHECK-NEXT: vmov q6[3], q6[1], r5, r6
2440 ; CHECK-NEXT: vmov.u8 r6, q1[5]
2441 ; CHECK-NEXT: vmov.u8 r5, q1[4]
2442 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r6
2443 ; CHECK-NEXT: vmov.u8 r5, q4[5]
2444 ; CHECK-NEXT: vmov q3[2], q3[0], r4, r5
2445 ; CHECK-NEXT: vand q0, q0, q2
2446 ; CHECK-NEXT: vand q3, q3, q2
2447 ; CHECK-NEXT: vmov r6, s2
2448 ; CHECK-NEXT: vmov r5, s14
2449 ; CHECK-NEXT: vmov r4, s0
2450 ; CHECK-NEXT: vmov r3, s12
2451 ; CHECK-NEXT: umull r6, r5, r5, r6
2452 ; CHECK-NEXT: umull r3, r4, r3, r4
2453 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r6
2454 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2455 ; CHECK-NEXT: vand q0, q0, q6
2456 ; CHECK-NEXT: vmov r3, r4, d0
2457 ; CHECK-NEXT: adds.w r3, r3, r12
2458 ; CHECK-NEXT: adc.w r6, lr, r4
2459 ; CHECK-NEXT: vmov r5, r4, d1
2460 ; CHECK-NEXT: adds r3, r3, r5
2461 ; CHECK-NEXT: vmov.u8 r5, q4[6]
2462 ; CHECK-NEXT: adc.w r12, r6, r4
2463 ; CHECK-NEXT: ubfx r6, r2, #12, #1
2464 ; CHECK-NEXT: ubfx r2, r2, #8, #1
2465 ; CHECK-NEXT: rsbs r6, r6, #0
2466 ; CHECK-NEXT: rsbs r2, r2, #0
2467 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r6
2468 ; CHECK-NEXT: vmov q6[3], q6[1], r2, r6
2469 ; CHECK-NEXT: vmov.u8 r2, q1[7]
2470 ; CHECK-NEXT: vmov.u8 r6, q1[6]
2471 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
2472 ; CHECK-NEXT: vmov.u8 r6, q4[7]
2473 ; CHECK-NEXT: vmov q3[2], q3[0], r5, r6
2474 ; CHECK-NEXT: vand q0, q0, q2
2475 ; CHECK-NEXT: vand q3, q3, q2
2476 ; CHECK-NEXT: vmov r2, s2
2477 ; CHECK-NEXT: vmov r6, s14
2478 ; CHECK-NEXT: vmov r4, s12
2479 ; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
2480 ; CHECK-NEXT: vmov r5, s0
2481 ; CHECK-NEXT: umull r2, r6, r6, r2
2482 ; CHECK-NEXT: umull r5, r4, r4, r5
2483 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
2484 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r6
2485 ; CHECK-NEXT: vmov.u8 r4, q7[8]
2486 ; CHECK-NEXT: vand q0, q0, q6
2487 ; CHECK-NEXT: vmov r2, r6, d0
2488 ; CHECK-NEXT: adds r2, r2, r3
2489 ; CHECK-NEXT: adc.w r3, r12, r6
2490 ; CHECK-NEXT: vmov r6, r5, d1
2491 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
2492 ; CHECK-NEXT: adds.w r12, r2, r6
2493 ; CHECK-NEXT: vmov.u8 r2, q5[8]
2494 ; CHECK-NEXT: vmov.16 q6[0], r2
2495 ; CHECK-NEXT: vmov.u8 r2, q5[9]
2496 ; CHECK-NEXT: vmov.16 q6[1], r2
2497 ; CHECK-NEXT: vmov.u8 r2, q5[10]
2498 ; CHECK-NEXT: vmov.16 q6[2], r2
2499 ; CHECK-NEXT: vmov.u8 r2, q5[11]
2500 ; CHECK-NEXT: vmov.16 q6[3], r2
2501 ; CHECK-NEXT: vmov.u8 r2, q5[12]
2502 ; CHECK-NEXT: vmov.16 q6[4], r2
2503 ; CHECK-NEXT: vmov.u8 r2, q5[13]
2504 ; CHECK-NEXT: vmov.16 q6[5], r2
2505 ; CHECK-NEXT: vmov.u8 r2, q5[14]
2506 ; CHECK-NEXT: vmov.16 q6[6], r2
2507 ; CHECK-NEXT: vmov.u8 r2, q5[15]
2508 ; CHECK-NEXT: vmov.16 q6[7], r2
2509 ; CHECK-NEXT: adc.w lr, r3, r5
2510 ; CHECK-NEXT: vcmp.i16 ne, q6, zr
2511 ; CHECK-NEXT: vpsel q3, q3, q0
2512 ; CHECK-NEXT: vmov.u16 r2, q3[2]
2513 ; CHECK-NEXT: vmov.u16 r6, q3[0]
2514 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
2515 ; CHECK-NEXT: vmov.u16 r2, q3[3]
2516 ; CHECK-NEXT: vmov.u16 r6, q3[1]
2517 ; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
2518 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2519 ; CHECK-NEXT: vmrs r2, p0
2520 ; CHECK-NEXT: and r5, r2, #1
2521 ; CHECK-NEXT: ubfx r6, r2, #4, #1
2522 ; CHECK-NEXT: rsbs r5, r5, #0
2523 ; CHECK-NEXT: rsbs r6, r6, #0
2524 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r6
2525 ; CHECK-NEXT: vmov q4[3], q4[1], r5, r6
2526 ; CHECK-NEXT: vmov.u8 r6, q1[9]
2527 ; CHECK-NEXT: vmov.u8 r5, q1[8]
2528 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r6
2529 ; CHECK-NEXT: vmov.u8 r5, q7[9]
2530 ; CHECK-NEXT: vmov q5[2], q5[0], r4, r5
2531 ; CHECK-NEXT: vand q0, q0, q2
2532 ; CHECK-NEXT: vand q5, q5, q2
2533 ; CHECK-NEXT: vmov r6, s2
2534 ; CHECK-NEXT: vmov r5, s22
2535 ; CHECK-NEXT: vmov r4, s0
2536 ; CHECK-NEXT: vmov r3, s20
2537 ; CHECK-NEXT: umull r6, r5, r5, r6
2538 ; CHECK-NEXT: umull r3, r4, r3, r4
2539 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r6
2540 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2541 ; CHECK-NEXT: vand q0, q0, q4
2542 ; CHECK-NEXT: vmov r3, r4, d0
2543 ; CHECK-NEXT: adds.w r3, r3, r12
2544 ; CHECK-NEXT: adc.w r6, lr, r4
2545 ; CHECK-NEXT: vmov r5, r4, d1
2546 ; CHECK-NEXT: adds r3, r3, r5
2547 ; CHECK-NEXT: vmov.u8 r5, q7[10]
2548 ; CHECK-NEXT: adc.w r12, r6, r4
2549 ; CHECK-NEXT: ubfx r6, r2, #12, #1
2550 ; CHECK-NEXT: ubfx r2, r2, #8, #1
2551 ; CHECK-NEXT: rsbs r6, r6, #0
2552 ; CHECK-NEXT: rsbs r2, r2, #0
2553 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r6
2554 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r6
2555 ; CHECK-NEXT: vmov.u8 r2, q1[11]
2556 ; CHECK-NEXT: vmov.u8 r6, q1[10]
2557 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
2558 ; CHECK-NEXT: vmov.u8 r6, q7[11]
2559 ; CHECK-NEXT: vmov q5[2], q5[0], r5, r6
2560 ; CHECK-NEXT: vand q0, q0, q2
2561 ; CHECK-NEXT: vand q5, q5, q2
2562 ; CHECK-NEXT: vmov r2, s2
2563 ; CHECK-NEXT: vmov r6, s22
2564 ; CHECK-NEXT: vmov r5, s0
2565 ; CHECK-NEXT: vmov r4, s20
2566 ; CHECK-NEXT: umull r2, r6, r6, r2
2567 ; CHECK-NEXT: umull r5, r4, r4, r5
2568 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
2569 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r6
2570 ; CHECK-NEXT: vmov.u8 r4, q7[12]
2571 ; CHECK-NEXT: vand q0, q0, q4
2572 ; CHECK-NEXT: vmov r2, r6, d0
2573 ; CHECK-NEXT: adds r2, r2, r3
2574 ; CHECK-NEXT: adc.w r3, r12, r6
2575 ; CHECK-NEXT: vmov r6, r5, d1
2576 ; CHECK-NEXT: adds.w r12, r2, r6
2577 ; CHECK-NEXT: vmov.u16 r2, q3[6]
2578 ; CHECK-NEXT: vmov.u16 r6, q3[4]
2579 ; CHECK-NEXT: adc.w lr, r3, r5
2580 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
2581 ; CHECK-NEXT: vmov.u16 r2, q3[7]
2582 ; CHECK-NEXT: vmov.u16 r6, q3[5]
2583 ; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
2584 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
2585 ; CHECK-NEXT: vmrs r2, p0
2586 ; CHECK-NEXT: and r5, r2, #1
2587 ; CHECK-NEXT: ubfx r6, r2, #4, #1
2588 ; CHECK-NEXT: rsbs r5, r5, #0
2589 ; CHECK-NEXT: rsbs r6, r6, #0
2590 ; CHECK-NEXT: vmov q3[2], q3[0], r5, r6
2591 ; CHECK-NEXT: vmov q3[3], q3[1], r5, r6
2592 ; CHECK-NEXT: vmov.u8 r6, q1[13]
2593 ; CHECK-NEXT: vmov.u8 r5, q1[12]
2594 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r6
2595 ; CHECK-NEXT: vmov.u8 r5, q7[13]
2596 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r5
2597 ; CHECK-NEXT: vand q0, q0, q2
2598 ; CHECK-NEXT: vand q4, q4, q2
2599 ; CHECK-NEXT: vmov r6, s2
2600 ; CHECK-NEXT: vmov r5, s18
2601 ; CHECK-NEXT: vmov r4, s0
2602 ; CHECK-NEXT: vmov r3, s16
2603 ; CHECK-NEXT: umull r6, r5, r5, r6
2604 ; CHECK-NEXT: umull r3, r4, r3, r4
2605 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r6
2606 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2607 ; CHECK-NEXT: vand q0, q0, q3
2608 ; CHECK-NEXT: vmov r3, r4, d0
2609 ; CHECK-NEXT: adds.w r3, r3, r12
2610 ; CHECK-NEXT: adc.w r6, lr, r4
2611 ; CHECK-NEXT: vmov r5, r4, d1
2612 ; CHECK-NEXT: adds r3, r3, r5
2613 ; CHECK-NEXT: vmov.u8 r5, q7[14]
2614 ; CHECK-NEXT: adc.w r12, r6, r4
2615 ; CHECK-NEXT: ubfx r6, r2, #12, #1
2616 ; CHECK-NEXT: ubfx r2, r2, #8, #1
2617 ; CHECK-NEXT: rsbs r6, r6, #0
2618 ; CHECK-NEXT: rsbs r2, r2, #0
2619 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r6
2620 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r6
2621 ; CHECK-NEXT: vmov.u8 r2, q1[15]
2622 ; CHECK-NEXT: vmov.u8 r6, q1[14]
2623 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
2624 ; CHECK-NEXT: vmov.u8 r6, q7[15]
2625 ; CHECK-NEXT: vmov q1[2], q1[0], r5, r6
2626 ; CHECK-NEXT: vand q0, q0, q2
2627 ; CHECK-NEXT: vand q1, q1, q2
2628 ; CHECK-NEXT: vmov r2, s2
2629 ; CHECK-NEXT: vmov r6, s6
2630 ; CHECK-NEXT: vmov r5, s0
2631 ; CHECK-NEXT: vmov r4, s4
2632 ; CHECK-NEXT: umull r2, r6, r6, r2
2633 ; CHECK-NEXT: umull r5, r4, r4, r5
2634 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
2635 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r6
2636 ; CHECK-NEXT: vand q0, q0, q3
2637 ; CHECK-NEXT: vmov r2, r6, d0
2638 ; CHECK-NEXT: adds r2, r2, r3
2639 ; CHECK-NEXT: adc.w r3, r12, r6
2640 ; CHECK-NEXT: vmov r6, r5, d1
2641 ; CHECK-NEXT: adds r2, r2, r6
2642 ; CHECK-NEXT: adcs r3, r5
2643 ; CHECK-NEXT: adds r0, r0, r2
2644 ; CHECK-NEXT: adcs r1, r3
2645 ; CHECK-NEXT: add sp, #32
2646 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2647 ; CHECK-NEXT: pop {r4, r5, r6, pc}
2648 entry:
2649 %c = icmp eq <16 x i8> %b, zeroinitializer
2650 %xx = zext <16 x i8> %x to <16 x i64>
2651 %yy = zext <16 x i8> %y to <16 x i64>
2652 %m = mul <16 x i64> %xx, %yy
2653 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
2654 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
2655 %r = add i64 %z, %a
2656 ret i64 %r
2657 }
2659 define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
2660 ; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
2661 ; CHECK: @ %bb.0: @ %entry
2662 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2663 ; CHECK-NEXT: push {r4, r5, r6, lr}
2664 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2665 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2666 ; CHECK-NEXT: vcmp.i8 eq, q2, zr
2667 ; CHECK-NEXT: vmov.i8 q2, #0x0
2668 ; CHECK-NEXT: vmov.i8 q3, #0xff
2669 ; CHECK-NEXT: vmov.s8 r4, q0[0]
2670 ; CHECK-NEXT: vpsel q4, q3, q2
2671 ; CHECK-NEXT: vmov.s8 r5, q0[2]
2672 ; CHECK-NEXT: vmov.u8 r2, q4[0]
2673 ; CHECK-NEXT: vmov.16 q5[0], r2
2674 ; CHECK-NEXT: vmov.u8 r2, q4[1]
2675 ; CHECK-NEXT: vmov.16 q5[1], r2
2676 ; CHECK-NEXT: vmov.u8 r2, q4[2]
2677 ; CHECK-NEXT: vmov.16 q5[2], r2
2678 ; CHECK-NEXT: vmov.u8 r2, q4[3]
2679 ; CHECK-NEXT: vmov.16 q5[3], r2
2680 ; CHECK-NEXT: vmov.u8 r2, q4[4]
2681 ; CHECK-NEXT: vmov.16 q5[4], r2
2682 ; CHECK-NEXT: vmov.u8 r2, q4[5]
2683 ; CHECK-NEXT: vmov.16 q5[5], r2
2684 ; CHECK-NEXT: vmov.u8 r2, q4[6]
2685 ; CHECK-NEXT: vmov.16 q5[6], r2
2686 ; CHECK-NEXT: vmov.u8 r2, q4[7]
2687 ; CHECK-NEXT: vmov.16 q5[7], r2
2688 ; CHECK-NEXT: vcmp.i16 ne, q5, zr
2689 ; CHECK-NEXT: vpsel q5, q3, q2
2690 ; CHECK-NEXT: vmov.u16 r2, q5[2]
2691 ; CHECK-NEXT: vmov.u16 r3, q5[0]
2692 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
2693 ; CHECK-NEXT: vmov.u16 r2, q5[3]
2694 ; CHECK-NEXT: vmov.u16 r3, q5[1]
2695 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
2696 ; CHECK-NEXT: vcmp.i32 ne, q6, zr
2697 ; CHECK-NEXT: vmrs r12, p0
2698 ; CHECK-NEXT: and r2, r12, #1
2699 ; CHECK-NEXT: ubfx r3, r12, #4, #1
2700 ; CHECK-NEXT: rsbs r2, r2, #0
2701 ; CHECK-NEXT: rsbs r3, r3, #0
2702 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r3
2703 ; CHECK-NEXT: vmov q6[3], q6[1], r2, r3
2704 ; CHECK-NEXT: vmov.s8 r2, q1[1]
2705 ; CHECK-NEXT: vmov.s8 r3, q0[1]
2706 ; CHECK-NEXT: smull r2, lr, r3, r2
2707 ; CHECK-NEXT: vmov.s8 r3, q1[0]
2708 ; CHECK-NEXT: smull r3, r4, r4, r3
2709 ; CHECK-NEXT: vmov q7[2], q7[0], r3, r2
2710 ; CHECK-NEXT: vmov q7[3], q7[1], r4, lr
2711 ; CHECK-NEXT: vand q6, q7, q6
2712 ; CHECK-NEXT: vmov r2, lr, d13
2713 ; CHECK-NEXT: vmov r4, r3, d12
2714 ; CHECK-NEXT: adds r6, r4, r2
2715 ; CHECK-NEXT: ubfx r4, r12, #12, #1
2716 ; CHECK-NEXT: ubfx r2, r12, #8, #1
2717 ; CHECK-NEXT: rsb.w r4, r4, #0
2718 ; CHECK-NEXT: rsb.w r2, r2, #0
2719 ; CHECK-NEXT: adc.w lr, lr, r3
2720 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r4
2721 ; CHECK-NEXT: vmov.s8 r3, q1[2]
2722 ; CHECK-NEXT: vmov q6[3], q6[1], r2, r4
2723 ; CHECK-NEXT: vmov.s8 r2, q1[3]
2724 ; CHECK-NEXT: vmov.s8 r4, q0[3]
2725 ; CHECK-NEXT: smull r3, r5, r5, r3
2726 ; CHECK-NEXT: smull r2, r4, r4, r2
2727 ; CHECK-NEXT: vmov q7[2], q7[0], r3, r2
2728 ; CHECK-NEXT: vmov q7[3], q7[1], r5, r4
2729 ; CHECK-NEXT: vand q6, q7, q6
2730 ; CHECK-NEXT: vmov r2, r3, d12
2731 ; CHECK-NEXT: adds r2, r2, r6
2732 ; CHECK-NEXT: vmov r6, r5, d13
2733 ; CHECK-NEXT: adc.w r3, r3, lr
2734 ; CHECK-NEXT: adds.w r12, r2, r6
2735 ; CHECK-NEXT: vmov.u16 r6, q5[6]
2736 ; CHECK-NEXT: adc.w lr, r3, r5
2737 ; CHECK-NEXT: vmov.u16 r5, q5[4]
2738 ; CHECK-NEXT: vmov q6[2], q6[0], r5, r6
2739 ; CHECK-NEXT: vmov.u16 r6, q5[7]
2740 ; CHECK-NEXT: vmov.u16 r5, q5[5]
2741 ; CHECK-NEXT: vmov.s8 r2, q1[4]
2742 ; CHECK-NEXT: vmov q6[3], q6[1], r5, r6
2743 ; CHECK-NEXT: vmov.s8 r3, q0[4]
2744 ; CHECK-NEXT: vcmp.i32 ne, q6, zr
2745 ; CHECK-NEXT: smull r2, r3, r3, r2
2746 ; CHECK-NEXT: vmrs r6, p0
2747 ; CHECK-NEXT: and r4, r6, #1
2748 ; CHECK-NEXT: ubfx r5, r6, #4, #1
2749 ; CHECK-NEXT: rsbs r4, r4, #0
2750 ; CHECK-NEXT: rsbs r5, r5, #0
2751 ; CHECK-NEXT: vmov q5[2], q5[0], r4, r5
2752 ; CHECK-NEXT: vmov q5[3], q5[1], r4, r5
2753 ; CHECK-NEXT: vmov.s8 r5, q1[5]
2754 ; CHECK-NEXT: vmov.s8 r4, q0[5]
2755 ; CHECK-NEXT: smull r5, r4, r4, r5
2756 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r5
2757 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r4
2758 ; CHECK-NEXT: vand q5, q6, q5
2759 ; CHECK-NEXT: vmov r2, r3, d10
2760 ; CHECK-NEXT: vmov r5, r4, d11
2761 ; CHECK-NEXT: adds.w r2, r2, r12
2762 ; CHECK-NEXT: adc.w r3, r3, lr
2763 ; CHECK-NEXT: adds.w r12, r2, r5
2764 ; CHECK-NEXT: ubfx r5, r6, #12, #1
2765 ; CHECK-NEXT: ubfx r6, r6, #8, #1
2766 ; CHECK-NEXT: rsb.w r5, r5, #0
2767 ; CHECK-NEXT: rsb.w r6, r6, #0
2768 ; CHECK-NEXT: vmov q5[2], q5[0], r6, r5
2769 ; CHECK-NEXT: adcs r3, r4
2770 ; CHECK-NEXT: vmov q5[3], q5[1], r6, r5
2771 ; CHECK-NEXT: vmov.s8 r6, q1[7]
2772 ; CHECK-NEXT: vmov.s8 r5, q0[7]
2773 ; CHECK-NEXT: vmov.s8 r4, q1[6]
2774 ; CHECK-NEXT: vmov.s8 r2, q0[6]
2775 ; CHECK-NEXT: smull r6, r5, r5, r6
2776 ; CHECK-NEXT: smull r2, r4, r2, r4
2777 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r6
2778 ; CHECK-NEXT: vmov q6[3], q6[1], r4, r5
2779 ; CHECK-NEXT: vand q5, q6, q5
2780 ; CHECK-NEXT: vmov r2, r6, d10
2781 ; CHECK-NEXT: adds.w r2, r2, r12
2782 ; CHECK-NEXT: adcs r3, r6
2783 ; CHECK-NEXT: vmov r6, r5, d11
2784 ; CHECK-NEXT: adds.w r12, r2, r6
2785 ; CHECK-NEXT: vmov.u8 r6, q4[8]
2786 ; CHECK-NEXT: vmov.16 q5[0], r6
2787 ; CHECK-NEXT: vmov.u8 r6, q4[9]
2788 ; CHECK-NEXT: vmov.16 q5[1], r6
2789 ; CHECK-NEXT: vmov.u8 r6, q4[10]
2790 ; CHECK-NEXT: vmov.16 q5[2], r6
2791 ; CHECK-NEXT: vmov.u8 r6, q4[11]
2792 ; CHECK-NEXT: vmov.16 q5[3], r6
2793 ; CHECK-NEXT: vmov.u8 r6, q4[12]
2794 ; CHECK-NEXT: vmov.16 q5[4], r6
2795 ; CHECK-NEXT: vmov.u8 r6, q4[13]
2796 ; CHECK-NEXT: vmov.16 q5[5], r6
2797 ; CHECK-NEXT: vmov.u8 r6, q4[14]
2798 ; CHECK-NEXT: vmov.16 q5[6], r6
2799 ; CHECK-NEXT: vmov.u8 r6, q4[15]
2800 ; CHECK-NEXT: vmov.16 q5[7], r6
2801 ; CHECK-NEXT: adc.w lr, r3, r5
2802 ; CHECK-NEXT: vcmp.i16 ne, q5, zr
2803 ; CHECK-NEXT: vmov.s8 r2, q1[8]
2804 ; CHECK-NEXT: vpsel q2, q3, q2
2805 ; CHECK-NEXT: vmov.s8 r3, q0[8]
2806 ; CHECK-NEXT: vmov.u16 r6, q2[2]
2807 ; CHECK-NEXT: vmov.u16 r5, q2[0]
2808 ; CHECK-NEXT: vmov q3[2], q3[0], r5, r6
2809 ; CHECK-NEXT: vmov.u16 r6, q2[3]
2810 ; CHECK-NEXT: vmov.u16 r5, q2[1]
2811 ; CHECK-NEXT: smull r2, r3, r3, r2
2812 ; CHECK-NEXT: vmov q3[3], q3[1], r5, r6
2813 ; CHECK-NEXT: vcmp.i32 ne, q3, zr
2814 ; CHECK-NEXT: vmrs r6, p0
2815 ; CHECK-NEXT: and r4, r6, #1
2816 ; CHECK-NEXT: ubfx r5, r6, #4, #1
2817 ; CHECK-NEXT: rsbs r4, r4, #0
2818 ; CHECK-NEXT: rsbs r5, r5, #0
2819 ; CHECK-NEXT: vmov q3[2], q3[0], r4, r5
2820 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r5
2821 ; CHECK-NEXT: vmov.s8 r5, q1[9]
2822 ; CHECK-NEXT: vmov.s8 r4, q0[9]
2823 ; CHECK-NEXT: smull r5, r4, r4, r5
2824 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r5
2825 ; CHECK-NEXT: vmov q4[3], q4[1], r3, r4
2826 ; CHECK-NEXT: vand q3, q4, q3
2827 ; CHECK-NEXT: vmov r2, r3, d6
2828 ; CHECK-NEXT: vmov r5, r4, d7
2829 ; CHECK-NEXT: adds.w r2, r2, r12
2830 ; CHECK-NEXT: adc.w r3, r3, lr
2831 ; CHECK-NEXT: adds.w r12, r2, r5
2832 ; CHECK-NEXT: ubfx r5, r6, #12, #1
2833 ; CHECK-NEXT: ubfx r6, r6, #8, #1
2834 ; CHECK-NEXT: rsb.w r5, r5, #0
2835 ; CHECK-NEXT: rsb.w r6, r6, #0
2836 ; CHECK-NEXT: vmov q3[2], q3[0], r6, r5
2837 ; CHECK-NEXT: adcs r3, r4
2838 ; CHECK-NEXT: vmov q3[3], q3[1], r6, r5
2839 ; CHECK-NEXT: vmov.s8 r6, q1[11]
2840 ; CHECK-NEXT: vmov.s8 r5, q0[11]
2841 ; CHECK-NEXT: vmov.s8 r4, q1[10]
2842 ; CHECK-NEXT: vmov.s8 r2, q0[10]
2843 ; CHECK-NEXT: smull r6, r5, r5, r6
2844 ; CHECK-NEXT: smull r2, r4, r2, r4
2845 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r6
2846 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r5
2847 ; CHECK-NEXT: vand q3, q4, q3
2848 ; CHECK-NEXT: vmov r2, r6, d6
2849 ; CHECK-NEXT: adds.w r2, r2, r12
2850 ; CHECK-NEXT: adcs r3, r6
2851 ; CHECK-NEXT: vmov r6, r5, d7
2852 ; CHECK-NEXT: adds.w r12, r2, r6
2853 ; CHECK-NEXT: vmov.u16 r6, q2[6]
2854 ; CHECK-NEXT: adc.w lr, r3, r5
2855 ; CHECK-NEXT: vmov.u16 r5, q2[4]
2856 ; CHECK-NEXT: vmov q3[2], q3[0], r5, r6
2857 ; CHECK-NEXT: vmov.u16 r6, q2[7]
2858 ; CHECK-NEXT: vmov.u16 r5, q2[5]
2859 ; CHECK-NEXT: vmov.s8 r2, q1[12]
2860 ; CHECK-NEXT: vmov q3[3], q3[1], r5, r6
2861 ; CHECK-NEXT: vmov.s8 r3, q0[12]
2862 ; CHECK-NEXT: vcmp.i32 ne, q3, zr
2863 ; CHECK-NEXT: smull r2, r3, r3, r2
2864 ; CHECK-NEXT: vmrs r6, p0
2865 ; CHECK-NEXT: and r4, r6, #1
2866 ; CHECK-NEXT: ubfx r5, r6, #4, #1
2867 ; CHECK-NEXT: rsbs r4, r4, #0
2868 ; CHECK-NEXT: rsbs r5, r5, #0
2869 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r5
2870 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
2871 ; CHECK-NEXT: vmov.s8 r5, q1[13]
2872 ; CHECK-NEXT: vmov.s8 r4, q0[13]
2873 ; CHECK-NEXT: smull r5, r4, r4, r5
2874 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r5
2875 ; CHECK-NEXT: vmov q3[3], q3[1], r3, r4
2876 ; CHECK-NEXT: vand q2, q3, q2
2877 ; CHECK-NEXT: vmov r2, r3, d4
2878 ; CHECK-NEXT: vmov r5, r4, d5
2879 ; CHECK-NEXT: adds.w r2, r2, r12
2880 ; CHECK-NEXT: adc.w r3, r3, lr
2881 ; CHECK-NEXT: adds.w r12, r2, r5
2882 ; CHECK-NEXT: ubfx r5, r6, #12, #1
2883 ; CHECK-NEXT: ubfx r6, r6, #8, #1
2884 ; CHECK-NEXT: rsb.w r5, r5, #0
2885 ; CHECK-NEXT: rsb.w r6, r6, #0
2886 ; CHECK-NEXT: vmov q2[2], q2[0], r6, r5
2887 ; CHECK-NEXT: adcs r3, r4
2888 ; CHECK-NEXT: vmov q2[3], q2[1], r6, r5
2889 ; CHECK-NEXT: vmov.s8 r6, q1[15]
2890 ; CHECK-NEXT: vmov.s8 r5, q0[15]
2891 ; CHECK-NEXT: vmov.s8 r4, q1[14]
2892 ; CHECK-NEXT: vmov.s8 r2, q0[14]
2893 ; CHECK-NEXT: smull r6, r5, r5, r6
2894 ; CHECK-NEXT: smull r2, r4, r2, r4
2895 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r6
2896 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
2897 ; CHECK-NEXT: vand q0, q0, q2
2898 ; CHECK-NEXT: vmov r2, r6, d0
2899 ; CHECK-NEXT: adds.w r2, r2, r12
2900 ; CHECK-NEXT: adcs r3, r6
2901 ; CHECK-NEXT: vmov r6, r5, d1
2902 ; CHECK-NEXT: adds r2, r2, r6
2903 ; CHECK-NEXT: adcs r3, r5
2904 ; CHECK-NEXT: adds r0, r0, r2
2905 ; CHECK-NEXT: adcs r1, r3
2906 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2907 ; CHECK-NEXT: pop {r4, r5, r6, pc}
2908 entry:
2909 %c = icmp eq <16 x i8> %b, zeroinitializer
2910 %xx = sext <16 x i8> %x to <16 x i64>
2911 %yy = sext <16 x i8> %y to <16 x i64>
2912 %m = mul <16 x i64> %xx, %yy
2913 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
2914 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
2915 %r = add i64 %z, %a
2916 ret i64 %r
2917 }
2919 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
2920 ; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
2921 ; CHECK: @ %bb.0: @ %entry
2922 ; CHECK-NEXT: .save {r7, lr}
2923 ; CHECK-NEXT: push {r7, lr}
2924 ; CHECK-NEXT: vmov.i64 q3, #0xff
2925 ; CHECK-NEXT: vand q1, q1, q3
2926 ; CHECK-NEXT: vand q0, q0, q3
2927 ; CHECK-NEXT: vmov r2, s6
2928 ; CHECK-NEXT: vmov r3, s2
2929 ; CHECK-NEXT: umull lr, r12, r3, r2
2930 ; CHECK-NEXT: vmov r3, s4
2931 ; CHECK-NEXT: vmov r2, s0
2932 ; CHECK-NEXT: vand q1, q2, q3
2933 ; CHECK-NEXT: umull r2, r3, r2, r3
2934 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
2935 ; CHECK-NEXT: vmov r2, s6
2936 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
2937 ; CHECK-NEXT: vmov r3, s4
2938 ; CHECK-NEXT: cmp r2, #0
2939 ; CHECK-NEXT: cset r2, eq
2940 ; CHECK-NEXT: cmp r2, #0
2941 ; CHECK-NEXT: csetm r2, ne
2942 ; CHECK-NEXT: cmp r3, #0
2943 ; CHECK-NEXT: cset r3, eq
2944 ; CHECK-NEXT: cmp r3, #0
2945 ; CHECK-NEXT: csetm r3, ne
2946 ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
2947 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
2948 ; CHECK-NEXT: vand q0, q0, q1
2949 ; CHECK-NEXT: vmov lr, r12, d1
2950 ; CHECK-NEXT: vmov r3, r2, d0
2951 ; CHECK-NEXT: adds.w r3, r3, lr
2952 ; CHECK-NEXT: adc.w r2, r2, r12
2953 ; CHECK-NEXT: adds r0, r0, r3
2954 ; CHECK-NEXT: adcs r1, r2
2955 ; CHECK-NEXT: pop {r7, pc}
2956 entry:
2957 %c = icmp eq <2 x i8> %b, zeroinitializer
2958 %xx = zext <2 x i8> %x to <2 x i64>
2959 %yy = zext <2 x i8> %y to <2 x i64>
2960 %m = mul <2 x i64> %xx, %yy
2961 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2962 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
2963 %r = add i64 %z, %a
2964 ret i64 %r
2965 }
2967 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
2968 ; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
2969 ; CHECK: @ %bb.0: @ %entry
2970 ; CHECK-NEXT: .save {r7, lr}
2971 ; CHECK-NEXT: push {r7, lr}
2972 ; CHECK-NEXT: vmov.i32 q3, #0xff
2973 ; CHECK-NEXT: vand q2, q2, q3
2974 ; CHECK-NEXT: vmov r2, s10
2975 ; CHECK-NEXT: vmov r3, s8
2976 ; CHECK-NEXT: cmp r2, #0
2977 ; CHECK-NEXT: cset r2, eq
2978 ; CHECK-NEXT: cmp r2, #0
2979 ; CHECK-NEXT: csetm r2, ne
2980 ; CHECK-NEXT: cmp r3, #0
2981 ; CHECK-NEXT: cset r3, eq
2982 ; CHECK-NEXT: cmp r3, #0
2983 ; CHECK-NEXT: csetm r3, ne
2984 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
2985 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r2
2986 ; CHECK-NEXT: vmov r2, s6
2987 ; CHECK-NEXT: vmov r3, s2
2988 ; CHECK-NEXT: sxtb r2, r2
2989 ; CHECK-NEXT: sxtb r3, r3
2990 ; CHECK-NEXT: smull lr, r12, r3, r2
2991 ; CHECK-NEXT: vmov r3, s4
2992 ; CHECK-NEXT: vmov r2, s0
2993 ; CHECK-NEXT: sxtb r3, r3
2994 ; CHECK-NEXT: sxtb r2, r2
2995 ; CHECK-NEXT: smull r2, r3, r2, r3
2996 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
2997 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
2998 ; CHECK-NEXT: vand q0, q0, q2
2999 ; CHECK-NEXT: vmov lr, r12, d1
3000 ; CHECK-NEXT: vmov r3, r2, d0
3001 ; CHECK-NEXT: adds.w r3, r3, lr
3002 ; CHECK-NEXT: adc.w r2, r2, r12
3003 ; CHECK-NEXT: adds r0, r0, r3
3004 ; CHECK-NEXT: adcs r1, r2
3005 ; CHECK-NEXT: pop {r7, pc}
3006 entry:
3007 %c = icmp eq <2 x i8> %b, zeroinitializer
3008 %xx = sext <2 x i8> %x to <2 x i64>
3009 %yy = sext <2 x i8> %y to <2 x i64>
3010 %m = mul <2 x i64> %xx, %yy
3011 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
3012 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
3013 %r = add i64 %z, %a
3014 ret i64 %r
3015 }
3017 define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) {
3018 ; CHECK-LABEL: add_v2i64_v2i64_acc:
3019 ; CHECK: @ %bb.0: @ %entry
3020 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
3021 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
3022 ; CHECK-NEXT: vmov r2, r12, d3
3023 ; CHECK-NEXT: vmov r3, lr, d1
3024 ; CHECK-NEXT: vmov r6, r9, d2
3025 ; CHECK-NEXT: vmov r5, r11, d0
3026 ; CHECK-NEXT: umull r10, r8, r3, r2
3027 ; CHECK-NEXT: umull r4, r7, r5, r6
3028 ; CHECK-NEXT: mla r3, r3, r12, r8
3029 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r10
3030 ; CHECK-NEXT: mla r2, lr, r2, r3
3031 ; CHECK-NEXT: mla r3, r5, r9, r7
3032 ; CHECK-NEXT: mla r3, r11, r6, r3
3033 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
3034 ; CHECK-NEXT: vmov r2, r3, d5
3035 ; CHECK-NEXT: orrs r2, r3
3036 ; CHECK-NEXT: vmov r3, r7, d4
3037 ; CHECK-NEXT: cset r2, eq
3038 ; CHECK-NEXT: cmp r2, #0
3039 ; CHECK-NEXT: csetm r2, ne
3040 ; CHECK-NEXT: orrs r3, r7
3041 ; CHECK-NEXT: cset r3, eq
3042 ; CHECK-NEXT: cmp r3, #0
3043 ; CHECK-NEXT: csetm r3, ne
3044 ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
3045 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
3046 ; CHECK-NEXT: vand q0, q0, q1
3047 ; CHECK-NEXT: vmov r2, r3, d1
3048 ; CHECK-NEXT: vmov r7, r6, d0
3049 ; CHECK-NEXT: adds r2, r2, r7
3050 ; CHECK-NEXT: adcs r3, r6
3051 ; CHECK-NEXT: adds r0, r0, r2
3052 ; CHECK-NEXT: adcs r1, r3
3053 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
3054 entry:
3055 %c = icmp eq <2 x i64> %b, zeroinitializer
3056 %m = mul <2 x i64> %x, %y
3057 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
3058 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
3059 %r = add i64 %z, %a
3060 ret i64 %r
3061 }
3063 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
3064 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
3065 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
3066 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
3067 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
3068 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
3069 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
3070 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
3071 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
3072 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)