1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
3 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT
; Unextended reduce.add over <4 x i32>: the checks expect a single addv on the
; .4s vector, with the result moved out to w0.
5 define i32 @add_v4i32_v4i32(<4 x i32> %x) {
6 ; CHECK-LABEL: add_v4i32_v4i32:
7 ; CHECK: // %bb.0: // %entry
8 ; CHECK-NEXT: addv s0, v0.4s
9 ; CHECK-NEXT: fmov w0, s0
12 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
; zext <4 x i32> to <4 x i64> feeding reduce.add: the checks expect one
; widening uaddlv and no separate extend instruction.
16 define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
17 ; CHECK-LABEL: add_v4i32_v4i64_zext:
18 ; CHECK: // %bb.0: // %entry
19 ; CHECK-NEXT: uaddlv d0, v0.4s
20 ; CHECK-NEXT: fmov x0, d0
23 %xx = zext <4 x i32> %x to <4 x i64>
24 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; sext <4 x i32> to <4 x i64> feeding reduce.add: the checks expect one
; widening saddlv and no separate extend instruction.
28 define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
29 ; CHECK-LABEL: add_v4i32_v4i64_sext:
30 ; CHECK: // %bb.0: // %entry
31 ; CHECK-NEXT: saddlv d0, v0.4s
32 ; CHECK-NEXT: fmov x0, d0
35 %xx = sext <4 x i32> %x to <4 x i64>
36 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; zext <2 x i32> to <2 x i64> feeding reduce.add: the checks expect an explicit
; ushll widen to .2d followed by a pairwise addp.
40 define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
41 ; CHECK-LABEL: add_v2i32_v2i64_zext:
42 ; CHECK: // %bb.0: // %entry
43 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
44 ; CHECK-NEXT: addp d0, v0.2d
45 ; CHECK-NEXT: fmov x0, d0
48 %xx = zext <2 x i32> %x to <2 x i64>
49 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; sext <2 x i32> to <2 x i64> feeding reduce.add: the checks expect an explicit
; sshll widen to .2d followed by a pairwise addp.
53 define i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
54 ; CHECK-LABEL: add_v2i32_v2i64_sext:
55 ; CHECK: // %bb.0: // %entry
56 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0
57 ; CHECK-NEXT: addp d0, v0.2d
58 ; CHECK-NEXT: fmov x0, d0
61 %xx = sext <2 x i32> %x to <2 x i64>
62 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; zext <8 x i16> to <8 x i32> feeding reduce.add: the checks expect one
; widening uaddlv over the .8h input.
66 define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
67 ; CHECK-LABEL: add_v8i16_v8i32_zext:
68 ; CHECK: // %bb.0: // %entry
69 ; CHECK-NEXT: uaddlv s0, v0.8h
70 ; CHECK-NEXT: fmov w0, s0
73 %xx = zext <8 x i16> %x to <8 x i32>
74 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; sext <8 x i16> to <8 x i32> feeding reduce.add: the checks expect one
; widening saddlv over the .8h input.
78 define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
79 ; CHECK-LABEL: add_v8i16_v8i32_sext:
80 ; CHECK: // %bb.0: // %entry
81 ; CHECK-NEXT: saddlv s0, v0.8h
82 ; CHECK-NEXT: fmov w0, s0
85 %xx = sext <8 x i16> %x to <8 x i32>
86 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; zext <4 x i16> to <4 x i32> feeding reduce.add: the checks expect a ushll
; widen to .4s followed by addv.
90 define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
91 ; CHECK-LABEL: add_v4i16_v4i32_zext:
92 ; CHECK: // %bb.0: // %entry
93 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
94 ; CHECK-NEXT: addv s0, v0.4s
95 ; CHECK-NEXT: fmov w0, s0
98 %xx = zext <4 x i16> %x to <4 x i32>
99 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; sext <4 x i16> to <4 x i32> feeding reduce.add: the checks expect an sshll
; widen to .4s followed by addv.
103 define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
104 ; CHECK-LABEL: add_v4i16_v4i32_sext:
105 ; CHECK: // %bb.0: // %entry
106 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
107 ; CHECK-NEXT: addv s0, v0.4s
108 ; CHECK-NEXT: fmov w0, s0
111 %xx = sext <4 x i16> %x to <4 x i32>
112 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; Unextended reduce.add over <8 x i16> with a zeroext i16 return: the checks
; expect addv into h0 and a plain fmov to w0, with no extra masking.
116 define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
117 ; CHECK-LABEL: add_v8i16_v8i16:
118 ; CHECK: // %bb.0: // %entry
119 ; CHECK-NEXT: addv h0, v0.8h
120 ; CHECK-NEXT: fmov w0, s0
123 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
; zext <8 x i16> to <8 x i64> feeding reduce.add: the checks expect
; ushll/ushll2 to .4s, uaddl/uaddl2 to .2d, a vector add, then addp.
127 define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
128 ; CHECK-LABEL: add_v8i16_v8i64_zext:
129 ; CHECK: // %bb.0: // %entry
130 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
131 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
132 ; CHECK-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
133 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
134 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
135 ; CHECK-NEXT: addp d0, v0.2d
136 ; CHECK-NEXT: fmov x0, d0
139 %xx = zext <8 x i16> %x to <8 x i64>
140 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; sext <8 x i16> to <8 x i64> feeding reduce.add: the checks expect
; sshll/sshll2 to .4s, saddl/saddl2 to .2d, a vector add, then addp.
144 define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
145 ; CHECK-LABEL: add_v8i16_v8i64_sext:
146 ; CHECK: // %bb.0: // %entry
147 ; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0
148 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
149 ; CHECK-NEXT: saddl2 v2.2d, v0.4s, v1.4s
150 ; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
151 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
152 ; CHECK-NEXT: addp d0, v0.2d
153 ; CHECK-NEXT: fmov x0, d0
156 %xx = sext <8 x i16> %x to <8 x i64>
157 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; zext <4 x i16> to <4 x i64> feeding reduce.add: the checks expect a ushll
; to .4s and then a widening uaddlv into d0.
161 define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
162 ; CHECK-LABEL: add_v4i16_v4i64_zext:
163 ; CHECK: // %bb.0: // %entry
164 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
165 ; CHECK-NEXT: uaddlv d0, v0.4s
166 ; CHECK-NEXT: fmov x0, d0
169 %xx = zext <4 x i16> %x to <4 x i64>
170 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; sext <4 x i16> to <4 x i64> feeding reduce.add: the checks expect an sshll
; to .4s and then a widening saddlv into d0.
174 define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
175 ; CHECK-LABEL: add_v4i16_v4i64_sext:
176 ; CHECK: // %bb.0: // %entry
177 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
178 ; CHECK-NEXT: saddlv d0, v0.4s
179 ; CHECK-NEXT: fmov x0, d0
182 %xx = sext <4 x i16> %x to <4 x i64>
183 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; zext <2 x i16> to <2 x i64> feeding reduce.add: the checks expect the low
; 16 bits of each 32-bit lane to be kept with a movi/and mask, then ushll to
; .2d and a pairwise addp.
187 define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
188 ; CHECK-LABEL: add_v2i16_v2i64_zext:
189 ; CHECK: // %bb.0: // %entry
190 ; CHECK-NEXT: movi d1, #0x00ffff0000ffff
191 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
192 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
193 ; CHECK-NEXT: addp d0, v0.2d
194 ; CHECK-NEXT: fmov x0, d0
197 %xx = zext <2 x i16> %x to <2 x i64>
198 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; sext <2 x i16> to <2 x i64> feeding reduce.add: the checks expect ushll to
; .2d, then a shl/sshr pair by 48 to sign-extend from 16 bits, then addp.
202 define i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
203 ; CHECK-LABEL: add_v2i16_v2i64_sext:
204 ; CHECK: // %bb.0: // %entry
205 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
206 ; CHECK-NEXT: shl v0.2d, v0.2d, #48
207 ; CHECK-NEXT: sshr v0.2d, v0.2d, #48
208 ; CHECK-NEXT: addp d0, v0.2d
209 ; CHECK-NEXT: fmov x0, d0
212 %xx = sext <2 x i16> %x to <2 x i64>
213 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; zext <16 x i8> to <16 x i32> feeding reduce.add. The two RUN configurations
; diverge here: without +dotprod (CHECK-BASE) the checks expect a
; ushll/uaddl widening chain ending in addv; with +dotprod (CHECK-DOT) they
; expect udot against an all-ones byte vector into a zeroed accumulator,
; followed by addv.
217 define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
218 ; CHECK-BASE-LABEL: add_v16i8_v16i32_zext:
219 ; CHECK-BASE: // %bb.0: // %entry
220 ; CHECK-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
221 ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
222 ; CHECK-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
223 ; CHECK-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
224 ; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s
225 ; CHECK-BASE-NEXT: addv s0, v0.4s
226 ; CHECK-BASE-NEXT: fmov w0, s0
227 ; CHECK-BASE-NEXT: ret
229 ; CHECK-DOT-LABEL: add_v16i8_v16i32_zext:
230 ; CHECK-DOT: // %bb.0: // %entry
231 ; CHECK-DOT-NEXT: movi v1.16b, #1
232 ; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
233 ; CHECK-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
234 ; CHECK-DOT-NEXT: addv s0, v2.4s
235 ; CHECK-DOT-NEXT: fmov w0, s0
236 ; CHECK-DOT-NEXT: ret
238 %xx = zext <16 x i8> %x to <16 x i32>
239 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
; sext <16 x i8> to <16 x i32> feeding reduce.add. Without +dotprod
; (CHECK-BASE) the checks expect an sshll/saddl widening chain ending in addv;
; with +dotprod (CHECK-DOT) they expect sdot against an all-ones byte vector
; into a zeroed accumulator, followed by addv.
243 define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
244 ; CHECK-BASE-LABEL: add_v16i8_v16i32_sext:
245 ; CHECK-BASE: // %bb.0: // %entry
246 ; CHECK-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
247 ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
248 ; CHECK-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
249 ; CHECK-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
250 ; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s
251 ; CHECK-BASE-NEXT: addv s0, v0.4s
252 ; CHECK-BASE-NEXT: fmov w0, s0
253 ; CHECK-BASE-NEXT: ret
255 ; CHECK-DOT-LABEL: add_v16i8_v16i32_sext:
256 ; CHECK-DOT: // %bb.0: // %entry
257 ; CHECK-DOT-NEXT: movi v1.16b, #1
258 ; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
259 ; CHECK-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
260 ; CHECK-DOT-NEXT: addv s0, v2.4s
261 ; CHECK-DOT-NEXT: fmov w0, s0
262 ; CHECK-DOT-NEXT: ret
264 %xx = sext <16 x i8> %x to <16 x i32>
265 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
; zext <8 x i8> to <8 x i32> feeding reduce.add. Without +dotprod (CHECK-BASE)
; the checks expect ushll to .8h then uaddlv; with +dotprod (CHECK-DOT) they
; expect the 64-bit udot form (.2s/.8b) and a pairwise addp of the two lanes.
269 define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
270 ; CHECK-BASE-LABEL: add_v8i8_v8i32_zext:
271 ; CHECK-BASE: // %bb.0: // %entry
272 ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
273 ; CHECK-BASE-NEXT: uaddlv s0, v0.8h
274 ; CHECK-BASE-NEXT: fmov w0, s0
275 ; CHECK-BASE-NEXT: ret
277 ; CHECK-DOT-LABEL: add_v8i8_v8i32_zext:
278 ; CHECK-DOT: // %bb.0: // %entry
279 ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
280 ; CHECK-DOT-NEXT: movi v2.8b, #1
281 ; CHECK-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
282 ; CHECK-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
283 ; CHECK-DOT-NEXT: fmov w0, s0
284 ; CHECK-DOT-NEXT: ret
286 %xx = zext <8 x i8> %x to <8 x i32>
287 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; sext <8 x i8> to <8 x i32> feeding reduce.add. Without +dotprod (CHECK-BASE)
; the checks expect sshll to .8h then saddlv; with +dotprod (CHECK-DOT) they
; expect the 64-bit sdot form (.2s/.8b) and a pairwise addp of the two lanes.
291 define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
292 ; CHECK-BASE-LABEL: add_v8i8_v8i32_sext:
293 ; CHECK-BASE: // %bb.0: // %entry
294 ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
295 ; CHECK-BASE-NEXT: saddlv s0, v0.8h
296 ; CHECK-BASE-NEXT: fmov w0, s0
297 ; CHECK-BASE-NEXT: ret
299 ; CHECK-DOT-LABEL: add_v8i8_v8i32_sext:
300 ; CHECK-DOT: // %bb.0: // %entry
301 ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
302 ; CHECK-DOT-NEXT: movi v2.8b, #1
303 ; CHECK-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
304 ; CHECK-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
305 ; CHECK-DOT-NEXT: fmov w0, s0
306 ; CHECK-DOT-NEXT: ret
308 %xx = sext <8 x i8> %x to <8 x i32>
309 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; zext <4 x i8> to <4 x i32> feeding reduce.add: the checks expect the upper
; byte of each .4h lane to be cleared with bic, then ushll to .4s and addv.
; Both RUN lines share the plain CHECK prefix for this function.
313 define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
314 ; CHECK-LABEL: add_v4i8_v4i32_zext:
315 ; CHECK: // %bb.0: // %entry
316 ; CHECK-NEXT: bic v0.4h, #255, lsl #8
317 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
318 ; CHECK-NEXT: addv s0, v0.4s
319 ; CHECK-NEXT: fmov w0, s0
322 %xx = zext <4 x i8> %x to <4 x i32>
323 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; sext <4 x i8> to <4 x i32> feeding reduce.add: the checks expect ushll to
; .4s, then a shl/sshr pair by 24 to sign-extend from 8 bits, then addv.
327 define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
328 ; CHECK-LABEL: add_v4i8_v4i32_sext:
329 ; CHECK: // %bb.0: // %entry
330 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
331 ; CHECK-NEXT: shl v0.4s, v0.4s, #24
332 ; CHECK-NEXT: sshr v0.4s, v0.4s, #24
333 ; CHECK-NEXT: addv s0, v0.4s
334 ; CHECK-NEXT: fmov w0, s0
337 %xx = sext <4 x i8> %x to <4 x i32>
338 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; zext <16 x i8> to <16 x i16> feeding reduce.add, zeroext i16 return: the
; checks expect a pairwise widening uaddlp to .8h followed by addv into h0.
342 define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
343 ; CHECK-LABEL: add_v16i8_v16i16_zext:
344 ; CHECK: // %bb.0: // %entry
345 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
346 ; CHECK-NEXT: addv h0, v0.8h
347 ; CHECK-NEXT: fmov w0, s0
350 %xx = zext <16 x i8> %x to <16 x i16>
351 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
; sext <16 x i8> to <16 x i16> feeding reduce.add, signext i16 return: the
; checks expect saddlp to .8h, addv into h0, and a sign-extending smov of
; lane h[0] into w0.
355 define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
356 ; CHECK-LABEL: add_v16i8_v16i16_sext:
357 ; CHECK: // %bb.0: // %entry
358 ; CHECK-NEXT: saddlp v0.8h, v0.16b
359 ; CHECK-NEXT: addv h0, v0.8h
360 ; CHECK-NEXT: smov w0, v0.h[0]
363 %xx = sext <16 x i8> %x to <16 x i16>
364 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
; zext <8 x i8> to <8 x i16> feeding reduce.add, zeroext i16 return: the
; checks expect ushll to .8h followed by addv into h0.
368 define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
369 ; CHECK-LABEL: add_v8i8_v8i16_zext:
370 ; CHECK: // %bb.0: // %entry
371 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
372 ; CHECK-NEXT: addv h0, v0.8h
373 ; CHECK-NEXT: fmov w0, s0
376 %xx = zext <8 x i8> %x to <8 x i16>
377 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
; sext <8 x i8> to <8 x i16> feeding reduce.add, signext i16 return: the
; checks expect sshll to .8h, addv into h0, and a sign-extending smov of
; lane h[0] into w0.
381 define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
382 ; CHECK-LABEL: add_v8i8_v8i16_sext:
383 ; CHECK: // %bb.0: // %entry
384 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
385 ; CHECK-NEXT: addv h0, v0.8h
386 ; CHECK-NEXT: smov w0, v0.h[0]
389 %xx = sext <8 x i8> %x to <8 x i16>
390 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
; Unextended reduce.add over <16 x i8> with a zeroext i8 return: the checks
; expect addv into b0 and a plain fmov to w0.
394 define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
395 ; CHECK-LABEL: add_v16i8_v16i8:
396 ; CHECK: // %bb.0: // %entry
397 ; CHECK-NEXT: addv b0, v0.16b
398 ; CHECK-NEXT: fmov w0, s0
401 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
; zext <16 x i8> to <16 x i64> feeding reduce.add: the checks expect two
; ushll/ushll2 widening stages (.8h, then .4s), uaddl/uaddl2 into four .2d
; partial sums, three vector adds to combine them, then addp.
405 define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
406 ; CHECK-LABEL: add_v16i8_v16i64_zext:
407 ; CHECK: // %bb.0: // %entry
408 ; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0
409 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
410 ; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0
411 ; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0
412 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
413 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
414 ; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
415 ; CHECK-NEXT: uaddl v2.2d, v3.2s, v2.2s
416 ; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
417 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
418 ; CHECK-NEXT: add v1.2d, v5.2d, v4.2d
419 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
420 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
421 ; CHECK-NEXT: addp d0, v0.2d
422 ; CHECK-NEXT: fmov x0, d0
425 %xx = zext <16 x i8> %x to <16 x i64>
426 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
; sext <16 x i8> to <16 x i64> feeding reduce.add: the checks expect two
; sshll/sshll2 widening stages (.8h, then .4s), saddl/saddl2 into four .2d
; partial sums, three vector adds to combine them, then addp.
430 define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
431 ; CHECK-LABEL: add_v16i8_v16i64_sext:
432 ; CHECK: // %bb.0: // %entry
433 ; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
434 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
435 ; CHECK-NEXT: sshll2 v2.4s, v1.8h, #0
436 ; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0
437 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
438 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
439 ; CHECK-NEXT: saddl2 v4.2d, v3.4s, v2.4s
440 ; CHECK-NEXT: saddl v2.2d, v3.2s, v2.2s
441 ; CHECK-NEXT: saddl2 v5.2d, v0.4s, v1.4s
442 ; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
443 ; CHECK-NEXT: add v1.2d, v5.2d, v4.2d
444 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
445 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
446 ; CHECK-NEXT: addp d0, v0.2d
447 ; CHECK-NEXT: fmov x0, d0
450 %xx = sext <16 x i8> %x to <16 x i64>
451 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
; zext <8 x i8> to <8 x i64> feeding reduce.add: the checks expect ushll to
; .8h, ushll/ushll2 to .4s, uaddl/uaddl2 to .2d, one vector add, then addp.
455 define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
456 ; CHECK-LABEL: add_v8i8_v8i64_zext:
457 ; CHECK: // %bb.0: // %entry
458 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
459 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
460 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
461 ; CHECK-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
462 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
463 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
464 ; CHECK-NEXT: addp d0, v0.2d
465 ; CHECK-NEXT: fmov x0, d0
468 %xx = zext <8 x i8> %x to <8 x i64>
469 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; sext <8 x i8> to <8 x i64> feeding reduce.add: the checks expect sshll to
; .8h, sshll/sshll2 to .4s, saddl/saddl2 to .2d, one vector add, then addp.
473 define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
474 ; CHECK-LABEL: add_v8i8_v8i64_sext:
475 ; CHECK: // %bb.0: // %entry
476 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
477 ; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0
478 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
479 ; CHECK-NEXT: saddl2 v2.2d, v0.4s, v1.4s
480 ; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
481 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
482 ; CHECK-NEXT: addp d0, v0.2d
483 ; CHECK-NEXT: fmov x0, d0
486 %xx = sext <8 x i8> %x to <8 x i64>
487 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; zext <4 x i8> to <4 x i64> feeding reduce.add: the checks expect the upper
; byte of each .4h lane to be cleared with bic, then ushll to .4s and a
; widening uaddlv into d0.
491 define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
492 ; CHECK-LABEL: add_v4i8_v4i64_zext:
493 ; CHECK: // %bb.0: // %entry
494 ; CHECK-NEXT: bic v0.4h, #255, lsl #8
495 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
496 ; CHECK-NEXT: uaddlv d0, v0.4s
497 ; CHECK-NEXT: fmov x0, d0
500 %xx = zext <4 x i8> %x to <4 x i64>
501 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; sext <4 x i8> to <4 x i64> feeding reduce.add: the checks expect unsigned
; widening to two .2d halves, a shl by 56 on each, then sshr on one half and
; ssra (shift-right-and-accumulate) of the other into it, then addp.
505 define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
506 ; CHECK-LABEL: add_v4i8_v4i64_sext:
507 ; CHECK: // %bb.0: // %entry
508 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
509 ; CHECK-NEXT: ushll v1.2d, v0.2s, #0
510 ; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
511 ; CHECK-NEXT: shl v1.2d, v1.2d, #56
512 ; CHECK-NEXT: shl v0.2d, v0.2d, #56
513 ; CHECK-NEXT: sshr v1.2d, v1.2d, #56
514 ; CHECK-NEXT: ssra v1.2d, v0.2d, #56
515 ; CHECK-NEXT: addp d0, v1.2d
516 ; CHECK-NEXT: fmov x0, d0
519 %xx = sext <4 x i8> %x to <4 x i64>
520 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; zext <2 x i8> to <2 x i64> feeding reduce.add: the checks expect the low
; 8 bits of each 32-bit lane to be kept with a movi/and mask, then ushll to
; .2d and a pairwise addp.
524 define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
525 ; CHECK-LABEL: add_v2i8_v2i64_zext:
526 ; CHECK: // %bb.0: // %entry
527 ; CHECK-NEXT: movi d1, #0x0000ff000000ff
528 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
529 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
530 ; CHECK-NEXT: addp d0, v0.2d
531 ; CHECK-NEXT: fmov x0, d0
534 %xx = zext <2 x i8> %x to <2 x i64>
535 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; sext <2 x i8> to <2 x i64> feeding reduce.add: the checks expect ushll to
; .2d, then a shl/sshr pair by 56 to sign-extend from 8 bits, then addp.
539 define i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
540 ; CHECK-LABEL: add_v2i8_v2i64_sext:
541 ; CHECK: // %bb.0: // %entry
542 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
543 ; CHECK-NEXT: shl v0.2d, v0.2d, #56
544 ; CHECK-NEXT: sshr v0.2d, v0.2d, #56
545 ; CHECK-NEXT: addp d0, v0.2d
546 ; CHECK-NEXT: fmov x0, d0
549 %xx = sext <2 x i8> %x to <2 x i64>
550 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Unextended reduce.add over <2 x i64>: the checks expect a single pairwise
; addp into d0.
554 define i64 @add_v2i64_v2i64(<2 x i64> %x) {
555 ; CHECK-LABEL: add_v2i64_v2i64:
556 ; CHECK: // %bb.0: // %entry
557 ; CHECK-NEXT: addp d0, v0.2d
558 ; CHECK-NEXT: fmov x0, d0
561 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
; Accumulator variant (extra i32 %a) of the unextended <4 x i32> reduce.add:
; the checks expect addv, an fmov into scratch w8, and a final scalar add
; with the incoming w0.
565 define i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
566 ; CHECK-LABEL: add_v4i32_v4i32_acc:
567 ; CHECK: // %bb.0: // %entry
568 ; CHECK-NEXT: addv s0, v0.4s
569 ; CHECK-NEXT: fmov w8, s0
570 ; CHECK-NEXT: add w0, w8, w0
573 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
; Accumulator variant (extra i64 %a) of zext <4 x i32> to <4 x i64> +
; reduce.add: the checks expect uaddlv, an fmov into x8, and a final scalar
; add with the incoming x0.
578 define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
579 ; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
580 ; CHECK: // %bb.0: // %entry
581 ; CHECK-NEXT: uaddlv d0, v0.4s
582 ; CHECK-NEXT: fmov x8, d0
583 ; CHECK-NEXT: add x0, x8, x0
586 %xx = zext <4 x i32> %x to <4 x i64>
587 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; Accumulator variant (extra i64 %a) of sext <4 x i32> to <4 x i64> +
; reduce.add: the checks expect saddlv, an fmov into x8, and a final scalar
; add with the incoming x0.
592 define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
593 ; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
594 ; CHECK: // %bb.0: // %entry
595 ; CHECK-NEXT: saddlv d0, v0.4s
596 ; CHECK-NEXT: fmov x8, d0
597 ; CHECK-NEXT: add x0, x8, x0
600 %xx = sext <4 x i32> %x to <4 x i64>
601 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; Accumulator variant (extra i64 %a) of zext <2 x i32> to <2 x i64> +
; reduce.add: the checks expect ushll, addp, an fmov into x8, and a final
; scalar add with the incoming x0.
606 define i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
607 ; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
608 ; CHECK: // %bb.0: // %entry
609 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
610 ; CHECK-NEXT: addp d0, v0.2d
611 ; CHECK-NEXT: fmov x8, d0
612 ; CHECK-NEXT: add x0, x8, x0
615 %xx = zext <2 x i32> %x to <2 x i64>
616 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Accumulator variant (extra i64 %a) of sext <2 x i32> to <2 x i64> +
; reduce.add: the checks expect sshll, addp, an fmov into x8, and a final
; scalar add with the incoming x0.
621 define i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
622 ; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
623 ; CHECK: // %bb.0: // %entry
624 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0
625 ; CHECK-NEXT: addp d0, v0.2d
626 ; CHECK-NEXT: fmov x8, d0
627 ; CHECK-NEXT: add x0, x8, x0
630 %xx = sext <2 x i32> %x to <2 x i64>
631 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Accumulator variant (extra i32 %a) of zext <8 x i16> to <8 x i32> +
; reduce.add: the checks expect uaddlv, an fmov into w8, and a final scalar
; add with the incoming w0.
636 define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
637 ; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
638 ; CHECK: // %bb.0: // %entry
639 ; CHECK-NEXT: uaddlv s0, v0.8h
640 ; CHECK-NEXT: fmov w8, s0
641 ; CHECK-NEXT: add w0, w8, w0
644 %xx = zext <8 x i16> %x to <8 x i32>
645 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; Accumulator variant (extra i32 %a) of sext <8 x i16> to <8 x i32> +
; reduce.add: the checks expect saddlv, an fmov into w8, and a final scalar
; add with the incoming w0.
650 define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
651 ; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
652 ; CHECK: // %bb.0: // %entry
653 ; CHECK-NEXT: saddlv s0, v0.8h
654 ; CHECK-NEXT: fmov w8, s0
655 ; CHECK-NEXT: add w0, w8, w0
658 %xx = sext <8 x i16> %x to <8 x i32>
659 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; Accumulator variant (extra i32 %a) of zext <4 x i16> to <4 x i32> +
; reduce.add: the checks expect ushll, addv, an fmov into w8, and a final
; scalar add with the incoming w0.
664 define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
665 ; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
666 ; CHECK: // %bb.0: // %entry
667 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
668 ; CHECK-NEXT: addv s0, v0.4s
669 ; CHECK-NEXT: fmov w8, s0
670 ; CHECK-NEXT: add w0, w8, w0
673 %xx = zext <4 x i16> %x to <4 x i32>
674 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; Accumulator variant (extra i32 %a) of sext <4 x i16> to <4 x i32> +
; reduce.add: the checks expect sshll, addv, an fmov into w8, and a final
; scalar add with the incoming w0.
679 define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
680 ; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
681 ; CHECK: // %bb.0: // %entry
682 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
683 ; CHECK-NEXT: addv s0, v0.4s
684 ; CHECK-NEXT: fmov w8, s0
685 ; CHECK-NEXT: add w0, w8, w0
688 %xx = sext <4 x i16> %x to <4 x i32>
689 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; Accumulator variant (extra i16 %a) of the unextended <8 x i16> reduce.add,
; zeroext i16 return: the checks expect addv into h0, a 32-bit scalar add
; with w0, and a final mask to 16 bits (and #0xffff).
694 define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
695 ; CHECK-LABEL: add_v8i16_v8i16_acc:
696 ; CHECK: // %bb.0: // %entry
697 ; CHECK-NEXT: addv h0, v0.8h
698 ; CHECK-NEXT: fmov w8, s0
699 ; CHECK-NEXT: add w8, w8, w0
700 ; CHECK-NEXT: and w0, w8, #0xffff
703 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
; Accumulator variant (extra i64 %a) of zext <8 x i16> to <8 x i64> +
; reduce.add: the checks expect the ushll/uaddl widening chain, a vector add,
; addp, an fmov into x8, and a final scalar add with the incoming x0.
708 define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
709 ; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
710 ; CHECK: // %bb.0: // %entry
711 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
712 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
713 ; CHECK-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
714 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
715 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
716 ; CHECK-NEXT: addp d0, v0.2d
717 ; CHECK-NEXT: fmov x8, d0
718 ; CHECK-NEXT: add x0, x8, x0
721 %xx = zext <8 x i16> %x to <8 x i64>
722 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; Accumulator variant (extra i64 %a) of sext <8 x i16> to <8 x i64> +
; reduce.add: the checks expect the sshll/saddl widening chain, a vector add,
; addp, an fmov into x8, and a final scalar add with the incoming x0.
727 define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
728 ; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
729 ; CHECK: // %bb.0: // %entry
730 ; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0
731 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
732 ; CHECK-NEXT: saddl2 v2.2d, v0.4s, v1.4s
733 ; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
734 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
735 ; CHECK-NEXT: addp d0, v0.2d
736 ; CHECK-NEXT: fmov x8, d0
737 ; CHECK-NEXT: add x0, x8, x0
740 %xx = sext <8 x i16> %x to <8 x i64>
741 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; Accumulator variant (extra i64 %a) of zext <4 x i16> to <4 x i64> +
; reduce.add: the checks expect ushll, uaddlv, an fmov into x8, and a final
; scalar add with the incoming x0.
746 define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
747 ; CHECK-LABEL: add_v4i16_v4i64_acc_zext:
748 ; CHECK: // %bb.0: // %entry
749 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
750 ; CHECK-NEXT: uaddlv d0, v0.4s
751 ; CHECK-NEXT: fmov x8, d0
752 ; CHECK-NEXT: add x0, x8, x0
755 %xx = zext <4 x i16> %x to <4 x i64>
756 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; Accumulator variant (extra i64 %a) of sext <4 x i16> to <4 x i64> +
; reduce.add: the checks expect sshll, saddlv, an fmov into x8, and a final
; scalar add with the incoming x0.
761 define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
762 ; CHECK-LABEL: add_v4i16_v4i64_acc_sext:
763 ; CHECK: // %bb.0: // %entry
764 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
765 ; CHECK-NEXT: saddlv d0, v0.4s
766 ; CHECK-NEXT: fmov x8, d0
767 ; CHECK-NEXT: add x0, x8, x0
770 %xx = sext <4 x i16> %x to <4 x i64>
771 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; Accumulator variant (extra i64 %a) of zext <2 x i16> to <2 x i64> +
; reduce.add: the checks expect a movi/and mask keeping the low 16 bits of
; each 32-bit lane, ushll, addp, an fmov into x8, and a final scalar add with
; the incoming x0.
776 define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
777 ; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
778 ; CHECK: // %bb.0: // %entry
779 ; CHECK-NEXT: movi d1, #0x00ffff0000ffff
780 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
781 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
782 ; CHECK-NEXT: addp d0, v0.2d
783 ; CHECK-NEXT: fmov x8, d0
784 ; CHECK-NEXT: add x0, x8, x0
787 %xx = zext <2 x i16> %x to <2 x i64>
788 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Accumulator variant (extra i64 %a) of sext <2 x i16> to <2 x i64> +
; reduce.add: the checks expect ushll, a shl/sshr pair by 48, addp, an fmov
; into x8, and a final scalar add with the incoming x0.
793 define i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
794 ; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
795 ; CHECK: // %bb.0: // %entry
796 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
797 ; CHECK-NEXT: shl v0.2d, v0.2d, #48
798 ; CHECK-NEXT: sshr v0.2d, v0.2d, #48
799 ; CHECK-NEXT: addp d0, v0.2d
800 ; CHECK-NEXT: fmov x8, d0
801 ; CHECK-NEXT: add x0, x8, x0
804 %xx = sext <2 x i16> %x to <2 x i64>
805 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Accumulator variant (extra i32 %a) of zext <16 x i8> to <16 x i32> +
; reduce.add. Without +dotprod (CHECK-BASE) the checks expect the ushll/uaddl
; chain and addv; with +dotprod (CHECK-DOT) they expect udot against an
; all-ones byte vector and addv. Both end with fmov into w8 and a scalar add
; with the incoming w0.
810 define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
811 ; CHECK-BASE-LABEL: add_v16i8_v16i32_acc_zext:
812 ; CHECK-BASE: // %bb.0: // %entry
813 ; CHECK-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
814 ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
815 ; CHECK-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
816 ; CHECK-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
817 ; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s
818 ; CHECK-BASE-NEXT: addv s0, v0.4s
819 ; CHECK-BASE-NEXT: fmov w8, s0
820 ; CHECK-BASE-NEXT: add w0, w8, w0
821 ; CHECK-BASE-NEXT: ret
823 ; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_zext:
824 ; CHECK-DOT: // %bb.0: // %entry
825 ; CHECK-DOT-NEXT: movi v1.16b, #1
826 ; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
827 ; CHECK-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
828 ; CHECK-DOT-NEXT: addv s0, v2.4s
829 ; CHECK-DOT-NEXT: fmov w8, s0
830 ; CHECK-DOT-NEXT: add w0, w8, w0
831 ; CHECK-DOT-NEXT: ret
833 %xx = zext <16 x i8> %x to <16 x i32>
834 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
; Accumulator variant (extra i32 %a) of sext <16 x i8> to <16 x i32> +
; reduce.add. Without +dotprod (CHECK-BASE) the checks expect the sshll/saddl
; chain and addv; with +dotprod (CHECK-DOT) they expect sdot against an
; all-ones byte vector and addv. Both end with fmov into w8 and a scalar add
; with the incoming w0.
839 define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
840 ; CHECK-BASE-LABEL: add_v16i8_v16i32_acc_sext:
841 ; CHECK-BASE: // %bb.0: // %entry
842 ; CHECK-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
843 ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
844 ; CHECK-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
845 ; CHECK-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
846 ; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s
847 ; CHECK-BASE-NEXT: addv s0, v0.4s
848 ; CHECK-BASE-NEXT: fmov w8, s0
849 ; CHECK-BASE-NEXT: add w0, w8, w0
850 ; CHECK-BASE-NEXT: ret
852 ; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_sext:
853 ; CHECK-DOT: // %bb.0: // %entry
854 ; CHECK-DOT-NEXT: movi v1.16b, #1
855 ; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
856 ; CHECK-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
857 ; CHECK-DOT-NEXT: addv s0, v2.4s
858 ; CHECK-DOT-NEXT: fmov w8, s0
859 ; CHECK-DOT-NEXT: add w0, w8, w0
860 ; CHECK-DOT-NEXT: ret
862 %xx = sext <16 x i8> %x to <16 x i32>
863 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
; Accumulator variant (extra i32 %a) of zext <8 x i8> to <8 x i32> +
; reduce.add. Without +dotprod (CHECK-BASE) the checks expect ushll then
; uaddlv; with +dotprod (CHECK-DOT) they expect the 64-bit udot form and a
; pairwise addp. Both end with fmov into w8 and a scalar add with the
; incoming w0.
868 define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
869 ; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_zext:
870 ; CHECK-BASE: // %bb.0: // %entry
871 ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
872 ; CHECK-BASE-NEXT: uaddlv s0, v0.8h
873 ; CHECK-BASE-NEXT: fmov w8, s0
874 ; CHECK-BASE-NEXT: add w0, w8, w0
875 ; CHECK-BASE-NEXT: ret
877 ; CHECK-DOT-LABEL: add_v8i8_v8i32_acc_zext:
878 ; CHECK-DOT: // %bb.0: // %entry
879 ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
880 ; CHECK-DOT-NEXT: movi v2.8b, #1
881 ; CHECK-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
882 ; CHECK-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
883 ; CHECK-DOT-NEXT: fmov w8, s0
884 ; CHECK-DOT-NEXT: add w0, w8, w0
885 ; CHECK-DOT-NEXT: ret
887 %xx = zext <8 x i8> %x to <8 x i32>
888 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; Accumulator variant (extra i32 %a) of sext <8 x i8> to <8 x i32> +
; reduce.add. Without +dotprod (CHECK-BASE) the checks expect sshll then
; saddlv; with +dotprod (CHECK-DOT) they expect the 64-bit sdot form and a
; pairwise addp. Both end with fmov into w8 and a scalar add with the
; incoming w0.
893 define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
894 ; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_sext:
895 ; CHECK-BASE: // %bb.0: // %entry
896 ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
897 ; CHECK-BASE-NEXT: saddlv s0, v0.8h
898 ; CHECK-BASE-NEXT: fmov w8, s0
899 ; CHECK-BASE-NEXT: add w0, w8, w0
900 ; CHECK-BASE-NEXT: ret
902 ; CHECK-DOT-LABEL: add_v8i8_v8i32_acc_sext:
903 ; CHECK-DOT: // %bb.0: // %entry
904 ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
905 ; CHECK-DOT-NEXT: movi v2.8b, #1
906 ; CHECK-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
907 ; CHECK-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
908 ; CHECK-DOT-NEXT: fmov w8, s0
909 ; CHECK-DOT-NEXT: add w0, w8, w0
910 ; CHECK-DOT-NEXT: ret
912 %xx = sext <8 x i8> %x to <8 x i32>
913 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
; Accumulator variant (extra i32 %a) of zext <4 x i8> to <4 x i32> +
; reduce.add: the checks expect bic to clear the upper byte of each .4h lane,
; ushll, addv, an fmov into w8, and a final scalar add with the incoming w0.
918 define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
919 ; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
920 ; CHECK: // %bb.0: // %entry
921 ; CHECK-NEXT: bic v0.4h, #255, lsl #8
922 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
923 ; CHECK-NEXT: addv s0, v0.4s
924 ; CHECK-NEXT: fmov w8, s0
925 ; CHECK-NEXT: add w0, w8, w0
928 %xx = zext <4 x i8> %x to <4 x i32>
929 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; Accumulator variant (extra i32 %a) of sext <4 x i8> to <4 x i32> +
; reduce.add: the checks expect ushll, a shl/sshr pair by 24, addv, an fmov
; into w8, and a final scalar add with the incoming w0.
934 define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
935 ; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
936 ; CHECK: // %bb.0: // %entry
937 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
938 ; CHECK-NEXT: shl v0.4s, v0.4s, #24
939 ; CHECK-NEXT: sshr v0.4s, v0.4s, #24
940 ; CHECK-NEXT: addv s0, v0.4s
941 ; CHECK-NEXT: fmov w8, s0
942 ; CHECK-NEXT: add w0, w8, w0
945 %xx = sext <4 x i8> %x to <4 x i32>
946 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
; Accumulator variant (extra i16 %a) of zext <16 x i8> to <16 x i16> +
; reduce.add, zeroext i16 return: the checks expect a single uaddlv into h0
; (rather than the uaddlp + addv pair expected for add_v16i8_v16i16_zext),
; a scalar add with w0, and a final mask to 16 bits.
951 define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
952 ; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
953 ; CHECK: // %bb.0: // %entry
954 ; CHECK-NEXT: uaddlv h0, v0.16b
955 ; CHECK-NEXT: fmov w8, s0
956 ; CHECK-NEXT: add w8, w8, w0
957 ; CHECK-NEXT: and w0, w8, #0xffff
960 %xx = zext <16 x i8> %x to <16 x i16>
961 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
; Accumulator variant (extra i16 %a) of sext <16 x i8> to <16 x i16> +
; reduce.add, signext i16 return: the checks expect a single saddlv into h0
; (rather than the saddlp + addv pair expected for add_v16i8_v16i16_sext),
; a scalar add with w0, and a final sxth.
966 define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
967 ; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
968 ; CHECK: // %bb.0: // %entry
969 ; CHECK-NEXT: saddlv h0, v0.16b
970 ; CHECK-NEXT: fmov w8, s0
971 ; CHECK-NEXT: add w8, w8, w0
972 ; CHECK-NEXT: sxth w0, w8
975 %xx = sext <16 x i8> %x to <16 x i16>
976 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
; Accumulator variant (extra i16 %a) of zext <8 x i8> to <8 x i16> +
; reduce.add, zeroext i16 return: the checks expect ushll, addv into h0, a
; scalar add with w0, and a final mask to 16 bits.
981 define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
982 ; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
983 ; CHECK: // %bb.0: // %entry
984 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
985 ; CHECK-NEXT: addv h0, v0.8h
986 ; CHECK-NEXT: fmov w8, s0
987 ; CHECK-NEXT: add w8, w8, w0
988 ; CHECK-NEXT: and w0, w8, #0xffff
991 %xx = zext <8 x i8> %x to <8 x i16>
992 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
; Accumulator variant (extra i16 %a) of sext <8 x i8> to <8 x i16> +
; reduce.add, signext i16 return: the checks expect sshll, addv into h0, a
; scalar add with w0, and a final sxth.
997 define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
998 ; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
999 ; CHECK: // %bb.0: // %entry
1000 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1001 ; CHECK-NEXT: addv h0, v0.8h
1002 ; CHECK-NEXT: fmov w8, s0
1003 ; CHECK-NEXT: add w8, w8, w0
1004 ; CHECK-NEXT: sxth w0, w8
1007 %xx = sext <8 x i8> %x to <8 x i16>
1008 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
; Accumulator variant (extra i8 %a) of the unextended <16 x i8> reduce.add,
; zeroext i8 return: the checks expect addv into b0, a scalar add with w0,
; and a final mask to 8 bits (and #0xff).
1013 define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
1014 ; CHECK-LABEL: add_v16i8_v16i8_acc:
1015 ; CHECK: // %bb.0: // %entry
1016 ; CHECK-NEXT: addv b0, v0.16b
1017 ; CHECK-NEXT: fmov w8, s0
1018 ; CHECK-NEXT: add w8, w8, w0
1019 ; CHECK-NEXT: and w0, w8, #0xff
1022 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
; Accumulator variant (extra i64 %a) of zext <16 x i8> to <16 x i64> +
; reduce.add: the checks expect two ushll widening stages, uaddl/uaddl2 into
; four .2d partial sums, three vector adds, addp, an fmov into x8, and a
; final scalar add with the incoming x0.
1027 define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
1028 ; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
1029 ; CHECK: // %bb.0: // %entry
1030 ; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0
1031 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1032 ; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0
1033 ; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0
1034 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1035 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1036 ; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
1037 ; CHECK-NEXT: uaddl v2.2d, v3.2s, v2.2s
1038 ; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
1039 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
1040 ; CHECK-NEXT: add v1.2d, v5.2d, v4.2d
1041 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
1042 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
1043 ; CHECK-NEXT: addp d0, v0.2d
1044 ; CHECK-NEXT: fmov x8, d0
1045 ; CHECK-NEXT: add x0, x8, x0
1048 %xx = zext <16 x i8> %x to <16 x i64>
1049 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
; Accumulator variant (extra i64 %a) of sext <16 x i8> to <16 x i64> +
; reduce.add: the checks expect two sshll widening stages, saddl/saddl2 into
; four .2d partial sums, three vector adds, addp, an fmov into x8, and a
; final scalar add with the incoming x0.
1054 define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
1055 ; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
1056 ; CHECK: // %bb.0: // %entry
1057 ; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
1058 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1059 ; CHECK-NEXT: sshll2 v2.4s, v1.8h, #0
1060 ; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0
1061 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
1062 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
1063 ; CHECK-NEXT: saddl2 v4.2d, v3.4s, v2.4s
1064 ; CHECK-NEXT: saddl v2.2d, v3.2s, v2.2s
1065 ; CHECK-NEXT: saddl2 v5.2d, v0.4s, v1.4s
1066 ; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
1067 ; CHECK-NEXT: add v1.2d, v5.2d, v4.2d
1068 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
1069 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
1070 ; CHECK-NEXT: addp d0, v0.2d
1071 ; CHECK-NEXT: fmov x8, d0
1072 ; CHECK-NEXT: add x0, x8, x0
1075 %xx = sext <16 x i8> %x to <16 x i64>
1076 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
; Accumulator variant (extra i64 %a) of zext <8 x i8> to <8 x i64> +
; reduce.add: the checks expect ushll to .8h, ushll/ushll2 to .4s,
; uaddl/uaddl2 to .2d, a vector add, addp, an fmov into x8, and a final
; scalar add with the incoming x0.
1081 define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
1082 ; CHECK-LABEL: add_v8i8_v8i64_acc_zext:
1083 ; CHECK: // %bb.0: // %entry
1084 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1085 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
1086 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1087 ; CHECK-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
1088 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
1089 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
1090 ; CHECK-NEXT: addp d0, v0.2d
1091 ; CHECK-NEXT: fmov x8, d0
1092 ; CHECK-NEXT: add x0, x8, x0
1095 %xx = zext <8 x i8> %x to <8 x i64>
1096 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; Accumulator variant (extra i64 %a) of sext <8 x i8> to <8 x i64> +
; reduce.add: the checks expect sshll to .8h, sshll/sshll2 to .4s,
; saddl/saddl2 to .2d, a vector add, addp, an fmov into x8, and a final
; scalar add with the incoming x0.
1101 define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
1102 ; CHECK-LABEL: add_v8i8_v8i64_acc_sext:
1103 ; CHECK: // %bb.0: // %entry
1104 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1105 ; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0
1106 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
1107 ; CHECK-NEXT: saddl2 v2.2d, v0.4s, v1.4s
1108 ; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
1109 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
1110 ; CHECK-NEXT: addp d0, v0.2d
1111 ; CHECK-NEXT: fmov x8, d0
1112 ; CHECK-NEXT: add x0, x8, x0
1115 %xx = sext <8 x i8> %x to <8 x i64>
1116 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; Accumulating reduction: zero-extends <4 x i8> %x to <4 x i64> and sums the
; lanes. Expected asm clears the upper byte of each .4h lane (bic), widens once
; with ushll, reduces with a single uaddlv and then adds x0 (presumably %a; its
; use is not visible in this excerpt).
1121 define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
1122 ; CHECK-LABEL: add_v4i8_v4i64_acc_zext:
1123 ; CHECK: // %bb.0: // %entry
1124 ; CHECK-NEXT: bic v0.4h, #255, lsl #8
1125 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1126 ; CHECK-NEXT: uaddlv d0, v0.4s
1127 ; CHECK-NEXT: fmov x8, d0
1128 ; CHECK-NEXT: add x0, x8, x0
1131 %xx = zext <4 x i8> %x to <4 x i64>
1132 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; Accumulating reduction: sign-extends <4 x i8> %x to <4 x i64> and sums the
; lanes. Expected asm widens the .4h input to two .2d halves with ushll/ushll2
; and performs the sign extension in-register with shl #56 followed by
; sshr/ssra #56 (ssra also folds the add of the two halves), then addp and a
; scalar add of x0 (presumably %a; its use is not visible in this excerpt).
1137 define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
1138 ; CHECK-LABEL: add_v4i8_v4i64_acc_sext:
1139 ; CHECK: // %bb.0: // %entry
1140 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1141 ; CHECK-NEXT: ushll v1.2d, v0.2s, #0
1142 ; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
1143 ; CHECK-NEXT: shl v1.2d, v1.2d, #56
1144 ; CHECK-NEXT: shl v0.2d, v0.2d, #56
1145 ; CHECK-NEXT: sshr v1.2d, v1.2d, #56
1146 ; CHECK-NEXT: ssra v1.2d, v0.2d, #56
1147 ; CHECK-NEXT: addp d0, v1.2d
1148 ; CHECK-NEXT: fmov x8, d0
1149 ; CHECK-NEXT: add x0, x8, x0
1152 %xx = sext <4 x i8> %x to <4 x i64>
1153 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
; Accumulating reduction: zero-extends <2 x i8> %x to <2 x i64> and sums the two
; lanes. Expected asm masks each 32-bit lane down to its low byte
; (movi mask + and), widens with ushll, reduces with addp and then adds x0
; (presumably %a; its use is not visible in this excerpt).
1158 define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
1159 ; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
1160 ; CHECK: // %bb.0: // %entry
1161 ; CHECK-NEXT: movi d1, #0x0000ff000000ff
1162 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
1163 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
1164 ; CHECK-NEXT: addp d0, v0.2d
1165 ; CHECK-NEXT: fmov x8, d0
1166 ; CHECK-NEXT: add x0, x8, x0
1169 %xx = zext <2 x i8> %x to <2 x i64>
1170 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Accumulating reduction: sign-extends <2 x i8> %x to <2 x i64> and sums the two
; lanes. Expected asm widens with ushll, sign-extends in-register via
; shl #56 / sshr #56, reduces with addp and then adds x0 (presumably %a; its use
; is not visible in this excerpt).
1175 define i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
1176 ; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
1177 ; CHECK: // %bb.0: // %entry
1178 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
1179 ; CHECK-NEXT: shl v0.2d, v0.2d, #56
1180 ; CHECK-NEXT: sshr v0.2d, v0.2d, #56
1181 ; CHECK-NEXT: addp d0, v0.2d
1182 ; CHECK-NEXT: fmov x8, d0
1183 ; CHECK-NEXT: add x0, x8, x0
1186 %xx = sext <2 x i8> %x to <2 x i64>
1187 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
; Accumulating reduction with no extension: sums the two lanes of <2 x i64> %x.
; Expected asm is a single addp followed by a scalar add of x0 (presumably %a;
; its use is not visible in this excerpt).
1192 define i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
1193 ; CHECK-LABEL: add_v2i64_v2i64_acc:
1194 ; CHECK: // %bb.0: // %entry
1195 ; CHECK-NEXT: addp d0, v0.2d
1196 ; CHECK-NEXT: fmov x8, d0
1197 ; CHECK-NEXT: add x0, x8, x0
1200 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
; Pair test: reduces %x and %y separately and adds the two scalar results.
; Expected asm folds this into one vector add followed by a single addv.
1205 define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
1206 ; CHECK-LABEL: add_pair_v4i32_v4i32:
1207 ; CHECK: // %bb.0: // %entry
1208 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
1209 ; CHECK-NEXT: addv s0, v0.4s
1210 ; CHECK-NEXT: fmov w0, s0
1213 %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
1214 %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
1215 %z = add i32 %z1, %z2
; Pair test: zero-extends %x and %y from <4 x i32> to <4 x i64>, reduces each
; and adds the results. Expected asm uses uaddlp on %y, accumulates %x on top
; with uadalp, then a single addp.
1219 define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
1220 ; CHECK-LABEL: add_pair_v4i32_v4i64_zext:
1221 ; CHECK: // %bb.0: // %entry
1222 ; CHECK-NEXT: uaddlp v1.2d, v1.4s
1223 ; CHECK-NEXT: uadalp v1.2d, v0.4s
1224 ; CHECK-NEXT: addp d0, v1.2d
1225 ; CHECK-NEXT: fmov x0, d0
1228 %xx = zext <4 x i32> %x to <4 x i64>
1229 %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1230 %yy = zext <4 x i32> %y to <4 x i64>
1231 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
1232 %z = add i64 %z1, %z2
; Pair test: sign-extends %x and %y from <4 x i32> to <4 x i64>, reduces each
; and adds the results. Expected asm uses saddlp on %y, accumulates %x on top
; with sadalp, then a single addp.
1236 define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
1237 ; CHECK-LABEL: add_pair_v4i32_v4i64_sext:
1238 ; CHECK: // %bb.0: // %entry
1239 ; CHECK-NEXT: saddlp v1.2d, v1.4s
1240 ; CHECK-NEXT: sadalp v1.2d, v0.4s
1241 ; CHECK-NEXT: addp d0, v1.2d
1242 ; CHECK-NEXT: fmov x0, d0
1245 %xx = sext <4 x i32> %x to <4 x i64>
1246 %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1247 %yy = sext <4 x i32> %y to <4 x i64>
1248 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
1249 %z = add i64 %z1, %z2
; Pair test: zero-extends %x and %y from <2 x i32> to <2 x i64>, reduces each
; and adds the results. Expected asm combines extension and pair-add in one
; uaddl, then reduces with addp.
1253 define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
1254 ; CHECK-LABEL: add_pair_v2i32_v2i64_zext:
1255 ; CHECK: // %bb.0: // %entry
1256 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
1257 ; CHECK-NEXT: addp d0, v0.2d
1258 ; CHECK-NEXT: fmov x0, d0
1261 %xx = zext <2 x i32> %x to <2 x i64>
1262 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1263 %yy = zext <2 x i32> %y to <2 x i64>
1264 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
1265 %z = add i64 %z1, %z2
; Pair test: sign-extends %x and %y from <2 x i32> to <2 x i64>, reduces each
; and adds the results. Expected asm combines extension and pair-add in one
; saddl, then reduces with addp.
1269 define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
1270 ; CHECK-LABEL: add_pair_v2i32_v2i64_sext:
1271 ; CHECK: // %bb.0: // %entry
1272 ; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
1273 ; CHECK-NEXT: addp d0, v0.2d
1274 ; CHECK-NEXT: fmov x0, d0
1277 %xx = sext <2 x i32> %x to <2 x i64>
1278 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1279 %yy = sext <2 x i32> %y to <2 x i64>
1280 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
1281 %z = add i64 %z1, %z2
; Pair test: zero-extends %x and %y from <8 x i16> to <8 x i32>, reduces each
; and adds the results. Expected asm uses uaddlp on %y, accumulates %x with
; uadalp, then a single addv.
1285 define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
1286 ; CHECK-LABEL: add_pair_v8i16_v8i32_zext:
1287 ; CHECK: // %bb.0: // %entry
1288 ; CHECK-NEXT: uaddlp v1.4s, v1.8h
1289 ; CHECK-NEXT: uadalp v1.4s, v0.8h
1290 ; CHECK-NEXT: addv s0, v1.4s
1291 ; CHECK-NEXT: fmov w0, s0
1294 %xx = zext <8 x i16> %x to <8 x i32>
1295 %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
1296 %yy = zext <8 x i16> %y to <8 x i32>
1297 %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
1298 %z = add i32 %z1, %z2
; Pair test: sign-extends %x and %y from <8 x i16> to <8 x i32>, reduces each
; and adds the results. Expected asm uses saddlp on %y, accumulates %x with
; sadalp, then a single addv.
1302 define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
1303 ; CHECK-LABEL: add_pair_v8i16_v8i32_sext:
1304 ; CHECK: // %bb.0: // %entry
1305 ; CHECK-NEXT: saddlp v1.4s, v1.8h
1306 ; CHECK-NEXT: sadalp v1.4s, v0.8h
1307 ; CHECK-NEXT: addv s0, v1.4s
1308 ; CHECK-NEXT: fmov w0, s0
1311 %xx = sext <8 x i16> %x to <8 x i32>
1312 %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
1313 %yy = sext <8 x i16> %y to <8 x i32>
1314 %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
1315 %z = add i32 %z1, %z2
; Pair test: zero-extends %x and %y from <4 x i16> to <4 x i32>, reduces each
; and adds the results. Expected asm is one uaddl followed by addv.
1319 define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
1320 ; CHECK-LABEL: add_pair_v4i16_v4i32_zext:
1321 ; CHECK: // %bb.0: // %entry
1322 ; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
1323 ; CHECK-NEXT: addv s0, v0.4s
1324 ; CHECK-NEXT: fmov w0, s0
1327 %xx = zext <4 x i16> %x to <4 x i32>
1328 %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
1329 %yy = zext <4 x i16> %y to <4 x i32>
1330 %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
1331 %z = add i32 %z1, %z2
; Pair test: sign-extends %x and %y from <4 x i16> to <4 x i32>, reduces each
; and adds the results. Expected asm is one saddl followed by addv.
1335 define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
1336 ; CHECK-LABEL: add_pair_v4i16_v4i32_sext:
1337 ; CHECK: // %bb.0: // %entry
1338 ; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
1339 ; CHECK-NEXT: addv s0, v0.4s
1340 ; CHECK-NEXT: fmov w0, s0
1343 %xx = sext <4 x i16> %x to <4 x i32>
1344 %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
1345 %yy = sext <4 x i16> %y to <4 x i32>
1346 %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
1347 %z = add i32 %z1, %z2
; Pair test without extension: reduces two <8 x i16> vectors and adds the i16
; results; the function returns a zeroext i16. Expected asm is one vector add,
; addv into h0, and an fmov of s0 into w0.
1351 define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
1352 ; CHECK-LABEL: add_pair_v8i16_v8i16:
1353 ; CHECK: // %bb.0: // %entry
1354 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
1355 ; CHECK-NEXT: addv h0, v0.8h
1356 ; CHECK-NEXT: fmov w0, s0
1359 %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
1360 %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
1361 %z = add i16 %z1, %z2
; Pair test: zero-extends %x and %y from <8 x i16> to <8 x i64>, reduces each
; and adds the results. Expected asm widens each input to .4s with
; ushll/ushll2, then to .2d with uaddl/uaddl2, sums the partial vectors and
; reduces with one addp.
1365 define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
1366 ; CHECK-LABEL: add_pair_v8i16_v8i64_zext:
1367 ; CHECK: // %bb.0: // %entry
1368 ; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
1369 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1370 ; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0
1371 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1372 ; CHECK-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
1373 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s
1374 ; CHECK-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
1375 ; CHECK-NEXT: uaddl v1.2d, v1.2s, v3.2s
1376 ; CHECK-NEXT: add v0.2d, v0.2d, v4.2d
1377 ; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
1378 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
1379 ; CHECK-NEXT: addp d0, v0.2d
1380 ; CHECK-NEXT: fmov x0, d0
1383 %xx = zext <8 x i16> %x to <8 x i64>
1384 %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
1385 %yy = zext <8 x i16> %y to <8 x i64>
1386 %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
1387 %z = add i64 %z1, %z2
; Pair test: sign-extends %x and %y from <8 x i16> to <8 x i64>, reduces each
; and adds the results. Expected asm widens each input to .4s with
; sshll/sshll2, then to .2d with saddl/saddl2, sums the partial vectors and
; reduces with one addp.
1391 define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
1392 ; CHECK-LABEL: add_pair_v8i16_v8i64_sext:
1393 ; CHECK: // %bb.0: // %entry
1394 ; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0
1395 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
1396 ; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0
1397 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
1398 ; CHECK-NEXT: saddl2 v4.2d, v0.4s, v2.4s
1399 ; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s
1400 ; CHECK-NEXT: saddl2 v2.2d, v1.4s, v3.4s
1401 ; CHECK-NEXT: saddl v1.2d, v1.2s, v3.2s
1402 ; CHECK-NEXT: add v0.2d, v0.2d, v4.2d
1403 ; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
1404 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
1405 ; CHECK-NEXT: addp d0, v0.2d
1406 ; CHECK-NEXT: fmov x0, d0
1409 %xx = sext <8 x i16> %x to <8 x i64>
1410 %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
1411 %yy = sext <8 x i16> %y to <8 x i64>
1412 %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
1413 %z = add i64 %z1, %z2
; Pair test: zero-extends %x and %y from <4 x i16> to <4 x i64>, reduces each
; and adds the results. Expected asm widens both inputs to .4s with ushll, then
; uses uaddlp + uadalp to reach .2d and a final addp.
1417 define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
1418 ; CHECK-LABEL: add_pair_v4i16_v4i64_zext:
1419 ; CHECK: // %bb.0: // %entry
1420 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1421 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1422 ; CHECK-NEXT: uaddlp v1.2d, v1.4s
1423 ; CHECK-NEXT: uadalp v1.2d, v0.4s
1424 ; CHECK-NEXT: addp d0, v1.2d
1425 ; CHECK-NEXT: fmov x0, d0
1428 %xx = zext <4 x i16> %x to <4 x i64>
1429 %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1430 %yy = zext <4 x i16> %y to <4 x i64>
1431 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
1432 %z = add i64 %z1, %z2
; Pair test: sign-extends %x and %y from <4 x i16> to <4 x i64>, reduces each
; and adds the results. Expected asm widens both inputs to .4s with sshll, then
; uses saddlp + sadalp to reach .2d and a final addp.
1436 define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
1437 ; CHECK-LABEL: add_pair_v4i16_v4i64_sext:
1438 ; CHECK: // %bb.0: // %entry
1439 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
1440 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
1441 ; CHECK-NEXT: saddlp v1.2d, v1.4s
1442 ; CHECK-NEXT: sadalp v1.2d, v0.4s
1443 ; CHECK-NEXT: addp d0, v1.2d
1444 ; CHECK-NEXT: fmov x0, d0
1447 %xx = sext <4 x i16> %x to <4 x i64>
1448 %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1449 %yy = sext <4 x i16> %y to <4 x i64>
1450 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
1451 %z = add i64 %z1, %z2
; Pair test: zero-extends %x and %y from <2 x i16> to <2 x i64>, reduces each
; and adds the results. Expected asm masks each 32-bit lane to its low 16 bits
; (shared movi mask + and), then one uaddl and addp.
1455 define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
1456 ; CHECK-LABEL: add_pair_v2i16_v2i64_zext:
1457 ; CHECK: // %bb.0: // %entry
1458 ; CHECK-NEXT: movi d2, #0x00ffff0000ffff
1459 ; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
1460 ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
1461 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
1462 ; CHECK-NEXT: addp d0, v0.2d
1463 ; CHECK-NEXT: fmov x0, d0
1466 %xx = zext <2 x i16> %x to <2 x i64>
1467 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1468 %yy = zext <2 x i16> %y to <2 x i64>
1469 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
1470 %z = add i64 %z1, %z2
; Pair test: sign-extends %x and %y from <2 x i16> to <2 x i64>, reduces each
; and adds the results. Expected asm widens with ushll, sign-extends
; in-register via shl #48 and sshr/ssra #48 (ssra also adds the two inputs),
; then reduces with addp.
1474 define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
1475 ; CHECK-LABEL: add_pair_v2i16_v2i64_sext:
1476 ; CHECK: // %bb.0: // %entry
1477 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
1478 ; CHECK-NEXT: ushll v1.2d, v1.2s, #0
1479 ; CHECK-NEXT: shl v0.2d, v0.2d, #48
1480 ; CHECK-NEXT: shl v1.2d, v1.2d, #48
1481 ; CHECK-NEXT: sshr v0.2d, v0.2d, #48
1482 ; CHECK-NEXT: ssra v0.2d, v1.2d, #48
1483 ; CHECK-NEXT: addp d0, v0.2d
1484 ; CHECK-NEXT: fmov x0, d0
1487 %xx = sext <2 x i16> %x to <2 x i64>
1488 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1489 %yy = sext <2 x i16> %y to <2 x i64>
1490 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
1491 %z = add i64 %z1, %z2
; Pair test: zero-extends %x and %y from <16 x i8> to <16 x i32>, reduces each
; and adds the results. The two RUN configurations diverge here:
;  - without +dotprod the expected asm widens with ushll/ushll2 and
;    uaddl/uaddl2, sums the partial vectors and finishes with addv;
;  - with +dotprod it accumulates both inputs into a zeroed .4s register using
;    udot against a vector of byte 1s, then addv.
1495 define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
1496 ; CHECK-BASE-LABEL: add_pair_v16i8_v16i32_zext:
1497 ; CHECK-BASE: // %bb.0: // %entry
1498 ; CHECK-BASE-NEXT: ushll2 v2.8h, v0.16b, #0
1499 ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
1500 ; CHECK-BASE-NEXT: ushll2 v3.8h, v1.16b, #0
1501 ; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
1502 ; CHECK-BASE-NEXT: uaddl2 v4.4s, v0.8h, v2.8h
1503 ; CHECK-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h
1504 ; CHECK-BASE-NEXT: uaddl2 v2.4s, v1.8h, v3.8h
1505 ; CHECK-BASE-NEXT: uaddl v1.4s, v1.4h, v3.4h
1506 ; CHECK-BASE-NEXT: add v0.4s, v0.4s, v4.4s
1507 ; CHECK-BASE-NEXT: add v1.4s, v1.4s, v2.4s
1508 ; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s
1509 ; CHECK-BASE-NEXT: addv s0, v0.4s
1510 ; CHECK-BASE-NEXT: fmov w0, s0
1511 ; CHECK-BASE-NEXT: ret
1513 ; CHECK-DOT-LABEL: add_pair_v16i8_v16i32_zext:
1514 ; CHECK-DOT: // %bb.0: // %entry
1515 ; CHECK-DOT-NEXT: movi v2.16b, #1
1516 ; CHECK-DOT-NEXT: movi v3.2d, #0000000000000000
1517 ; CHECK-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
1518 ; CHECK-DOT-NEXT: udot v3.4s, v0.16b, v2.16b
1519 ; CHECK-DOT-NEXT: addv s0, v3.4s
1520 ; CHECK-DOT-NEXT: fmov w0, s0
1521 ; CHECK-DOT-NEXT: ret
1523 %xx = zext <16 x i8> %x to <16 x i32>
1524 %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
1525 %yy = zext <16 x i8> %y to <16 x i32>
1526 %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
1527 %z = add i32 %z1, %z2
; Pair test: sign-extends %x and %y from <16 x i8> to <16 x i32>, reduces each
; and adds the results. The two RUN configurations diverge here:
;  - without +dotprod the expected asm widens with sshll/sshll2 and
;    saddl/saddl2, sums the partial vectors and finishes with addv;
;  - with +dotprod it accumulates both inputs into a zeroed .4s register using
;    sdot against a vector of byte 1s, then addv.
1531 define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
1532 ; CHECK-BASE-LABEL: add_pair_v16i8_v16i32_sext:
1533 ; CHECK-BASE: // %bb.0: // %entry
1534 ; CHECK-BASE-NEXT: sshll2 v2.8h, v0.16b, #0
1535 ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
1536 ; CHECK-BASE-NEXT: sshll2 v3.8h, v1.16b, #0
1537 ; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0
1538 ; CHECK-BASE-NEXT: saddl2 v4.4s, v0.8h, v2.8h
1539 ; CHECK-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h
1540 ; CHECK-BASE-NEXT: saddl2 v2.4s, v1.8h, v3.8h
1541 ; CHECK-BASE-NEXT: saddl v1.4s, v1.4h, v3.4h
1542 ; CHECK-BASE-NEXT: add v0.4s, v0.4s, v4.4s
1543 ; CHECK-BASE-NEXT: add v1.4s, v1.4s, v2.4s
1544 ; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s
1545 ; CHECK-BASE-NEXT: addv s0, v0.4s
1546 ; CHECK-BASE-NEXT: fmov w0, s0
1547 ; CHECK-BASE-NEXT: ret
1549 ; CHECK-DOT-LABEL: add_pair_v16i8_v16i32_sext:
1550 ; CHECK-DOT: // %bb.0: // %entry
1551 ; CHECK-DOT-NEXT: movi v2.16b, #1
1552 ; CHECK-DOT-NEXT: movi v3.2d, #0000000000000000
1553 ; CHECK-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
1554 ; CHECK-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
1555 ; CHECK-DOT-NEXT: addv s0, v3.4s
1556 ; CHECK-DOT-NEXT: fmov w0, s0
1557 ; CHECK-DOT-NEXT: ret
1559 %xx = sext <16 x i8> %x to <16 x i32>
1560 %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
1561 %yy = sext <16 x i8> %y to <16 x i32>
1562 %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
1563 %z = add i32 %z1, %z2
; Pair test: zero-extends %x and %y from <8 x i8> to <8 x i32>, reduces each
; and adds the results. The two RUN configurations diverge here:
;  - without +dotprod the expected asm widens with ushll, then uaddlp + uadalp
;    and addv;
;  - with +dotprod it accumulates both inputs into a zeroed .2s register using
;    udot against a vector of byte 1s and reduces the two lanes with addp.
1567 define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
1568 ; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_zext:
1569 ; CHECK-BASE: // %bb.0: // %entry
1570 ; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
1571 ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
1572 ; CHECK-BASE-NEXT: uaddlp v1.4s, v1.8h
1573 ; CHECK-BASE-NEXT: uadalp v1.4s, v0.8h
1574 ; CHECK-BASE-NEXT: addv s0, v1.4s
1575 ; CHECK-BASE-NEXT: fmov w0, s0
1576 ; CHECK-BASE-NEXT: ret
1578 ; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_zext:
1579 ; CHECK-DOT: // %bb.0: // %entry
1580 ; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
1581 ; CHECK-DOT-NEXT: movi v3.8b, #1
1582 ; CHECK-DOT-NEXT: udot v2.2s, v1.8b, v3.8b
1583 ; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
1584 ; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
1585 ; CHECK-DOT-NEXT: fmov w0, s0
1586 ; CHECK-DOT-NEXT: ret
1588 %xx = zext <8 x i8> %x to <8 x i32>
1589 %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
1590 %yy = zext <8 x i8> %y to <8 x i32>
1591 %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
1592 %z = add i32 %z1, %z2
; Pair test: sign-extends %x and %y from <8 x i8> to <8 x i32>, reduces each
; and adds the results. The two RUN configurations diverge here:
;  - without +dotprod the expected asm widens with sshll, then saddlp + sadalp
;    and addv;
;  - with +dotprod it accumulates both inputs into a zeroed .2s register using
;    sdot against a vector of byte 1s and reduces the two lanes with addp.
1596 define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
1597 ; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_sext:
1598 ; CHECK-BASE: // %bb.0: // %entry
1599 ; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0
1600 ; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
1601 ; CHECK-BASE-NEXT: saddlp v1.4s, v1.8h
1602 ; CHECK-BASE-NEXT: sadalp v1.4s, v0.8h
1603 ; CHECK-BASE-NEXT: addv s0, v1.4s
1604 ; CHECK-BASE-NEXT: fmov w0, s0
1605 ; CHECK-BASE-NEXT: ret
1607 ; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_sext:
1608 ; CHECK-DOT: // %bb.0: // %entry
1609 ; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
1610 ; CHECK-DOT-NEXT: movi v3.8b, #1
1611 ; CHECK-DOT-NEXT: sdot v2.2s, v1.8b, v3.8b
1612 ; CHECK-DOT-NEXT: sdot v2.2s, v0.8b, v3.8b
1613 ; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
1614 ; CHECK-DOT-NEXT: fmov w0, s0
1615 ; CHECK-DOT-NEXT: ret
1617 %xx = sext <8 x i8> %x to <8 x i32>
1618 %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
1619 %yy = sext <8 x i8> %y to <8 x i32>
1620 %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
1621 %z = add i32 %z1, %z2
; Pair test: zero-extends %x and %y from <4 x i8> to <4 x i32>, reduces each
; and adds the results. Expected asm clears the upper byte of every .4h lane in
; both inputs (bic), then one uaddl and addv. Both RUN configurations share the
; same expected output for this function.
1625 define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
1626 ; CHECK-LABEL: add_pair_v4i8_v4i32_zext:
1627 ; CHECK: // %bb.0: // %entry
1628 ; CHECK-NEXT: bic v0.4h, #255, lsl #8
1629 ; CHECK-NEXT: bic v1.4h, #255, lsl #8
1630 ; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
1631 ; CHECK-NEXT: addv s0, v0.4s
1632 ; CHECK-NEXT: fmov w0, s0
1635 %xx = zext <4 x i8> %x to <4 x i32>
1636 %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
1637 %yy = zext <4 x i8> %y to <4 x i32>
1638 %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
1639 %z = add i32 %z1, %z2
; Pair test: sign-extends %x and %y from <4 x i8> to <4 x i32>, reduces each
; and adds the results. Expected asm widens with ushll, sign-extends
; in-register via shl #24 and sshr/ssra #24 (ssra also adds the two inputs),
; then reduces with addv.
1643 define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
1644 ; CHECK-LABEL: add_pair_v4i8_v4i32_sext:
1645 ; CHECK: // %bb.0: // %entry
1646 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1647 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1648 ; CHECK-NEXT: shl v0.4s, v0.4s, #24
1649 ; CHECK-NEXT: shl v1.4s, v1.4s, #24
1650 ; CHECK-NEXT: sshr v0.4s, v0.4s, #24
1651 ; CHECK-NEXT: ssra v0.4s, v1.4s, #24
1652 ; CHECK-NEXT: addv s0, v0.4s
1653 ; CHECK-NEXT: fmov w0, s0
1656 %xx = sext <4 x i8> %x to <4 x i32>
1657 %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
1658 %yy = sext <4 x i8> %y to <4 x i32>
1659 %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
1660 %z = add i32 %z1, %z2
; Pair test: zero-extends %x and %y from <16 x i8> to <16 x i16>, reduces each
; and adds the results; the function returns a zeroext i16. Expected asm uses
; uaddlp + uadalp, addv into h0, and an fmov of s0 into w0.
1664 define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
1665 ; CHECK-LABEL: add_pair_v16i8_v16i16_zext:
1666 ; CHECK: // %bb.0: // %entry
1667 ; CHECK-NEXT: uaddlp v1.8h, v1.16b
1668 ; CHECK-NEXT: uadalp v1.8h, v0.16b
1669 ; CHECK-NEXT: addv h0, v1.8h
1670 ; CHECK-NEXT: fmov w0, s0
1673 %xx = zext <16 x i8> %x to <16 x i16>
1674 %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
1675 %yy = zext <16 x i8> %y to <16 x i16>
1676 %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
1677 %z = add i16 %z1, %z2
; Pair test: sign-extends %x and %y from <16 x i8> to <16 x i16>, reduces each
; and adds the results; the function returns a signext i16. Expected asm uses
; saddlp + sadalp, addv into h0, and smov from v0.h[0] to produce the
; sign-extended result in w0.
1681 define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
1682 ; CHECK-LABEL: add_pair_v16i8_v16i16_sext:
1683 ; CHECK: // %bb.0: // %entry
1684 ; CHECK-NEXT: saddlp v1.8h, v1.16b
1685 ; CHECK-NEXT: sadalp v1.8h, v0.16b
1686 ; CHECK-NEXT: addv h0, v1.8h
1687 ; CHECK-NEXT: smov w0, v0.h[0]
1690 %xx = sext <16 x i8> %x to <16 x i16>
1691 %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
1692 %yy = sext <16 x i8> %y to <16 x i16>
1693 %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
1694 %z = add i16 %z1, %z2
; Pair test: zero-extends %x and %y from <8 x i8> to <8 x i16>, reduces each
; and adds the results; the function returns a zeroext i16. Expected asm is one
; uaddl, addv into h0, and an fmov of s0 into w0.
1698 define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
1699 ; CHECK-LABEL: add_pair_v8i8_v8i16_zext:
1700 ; CHECK: // %bb.0: // %entry
1701 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
1702 ; CHECK-NEXT: addv h0, v0.8h
1703 ; CHECK-NEXT: fmov w0, s0
1706 %xx = zext <8 x i8> %x to <8 x i16>
1707 %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
1708 %yy = zext <8 x i8> %y to <8 x i16>
1709 %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
1710 %z = add i16 %z1, %z2
; Pair test: sign-extends %x and %y from <8 x i8> to <8 x i16>, reduces each
; and adds the results; the function returns a signext i16. Expected asm is one
; saddl, addv into h0, and smov from v0.h[0] into w0.
1714 define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
1715 ; CHECK-LABEL: add_pair_v8i8_v8i16_sext:
1716 ; CHECK: // %bb.0: // %entry
1717 ; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b
1718 ; CHECK-NEXT: addv h0, v0.8h
1719 ; CHECK-NEXT: smov w0, v0.h[0]
1722 %xx = sext <8 x i8> %x to <8 x i16>
1723 %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
1724 %yy = sext <8 x i8> %y to <8 x i16>
1725 %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
1726 %z = add i16 %z1, %z2
; Pair test without extension: reduces two <16 x i8> vectors and adds the i8
; results; the function returns a zeroext i8. Expected asm is one vector add,
; addv into b0, and an fmov of s0 into w0.
1730 define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
1731 ; CHECK-LABEL: add_pair_v16i8_v16i8:
1732 ; CHECK: // %bb.0: // %entry
1733 ; CHECK-NEXT: add v0.16b, v0.16b, v1.16b
1734 ; CHECK-NEXT: addv b0, v0.16b
1735 ; CHECK-NEXT: fmov w0, s0
1738 %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
1739 %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
1740 %z = add i8 %z1, %z2
; Pair test: zero-extends %x and %y from <16 x i8> to <16 x i64>, reduces each
; and adds the results. Expected asm is the fully expanded form: two rounds of
; ushll/ushll2 (to .8h, then .4s), uaddl/uaddl2 into .2d partial sums, a tree
; of vector adds, and one final addp. Both RUN configurations share this
; output (no dot-product variant is expected for the i64 result).
1744 define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
1745 ; CHECK-LABEL: add_pair_v16i8_v16i64_zext:
1746 ; CHECK: // %bb.0: // %entry
1747 ; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0
1748 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1749 ; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0
1750 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
1751 ; CHECK-NEXT: ushll v4.4s, v2.4h, #0
1752 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
1753 ; CHECK-NEXT: ushll2 v5.4s, v0.8h, #0
1754 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1755 ; CHECK-NEXT: ushll2 v6.4s, v3.8h, #0
1756 ; CHECK-NEXT: ushll2 v7.4s, v1.8h, #0
1757 ; CHECK-NEXT: ushll v3.4s, v3.4h, #0
1758 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1759 ; CHECK-NEXT: uaddl2 v16.2d, v5.4s, v2.4s
1760 ; CHECK-NEXT: uaddl v2.2d, v5.2s, v2.2s
1761 ; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v4.4s
1762 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v4.2s
1763 ; CHECK-NEXT: uaddl2 v4.2d, v7.4s, v6.4s
1764 ; CHECK-NEXT: uaddl v6.2d, v7.2s, v6.2s
1765 ; CHECK-NEXT: uaddl2 v7.2d, v1.4s, v3.4s
1766 ; CHECK-NEXT: uaddl v1.2d, v1.2s, v3.2s
1767 ; CHECK-NEXT: add v3.2d, v5.2d, v16.2d
1768 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
1769 ; CHECK-NEXT: add v2.2d, v7.2d, v4.2d
1770 ; CHECK-NEXT: add v1.2d, v1.2d, v6.2d
1771 ; CHECK-NEXT: add v0.2d, v0.2d, v3.2d
1772 ; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
1773 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
1774 ; CHECK-NEXT: addp d0, v0.2d
1775 ; CHECK-NEXT: fmov x0, d0
1778 %xx = zext <16 x i8> %x to <16 x i64>
1779 %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
1780 %yy = zext <16 x i8> %y to <16 x i64>
1781 %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
1782 %z = add i64 %z1, %z2
; Pair test: sign-extends %x and %y from <16 x i8> to <16 x i64>, reduces each
; and adds the results. Expected asm is the fully expanded form: two rounds of
; sshll/sshll2 (to .8h, then .4s), saddl/saddl2 into .2d partial sums, a tree
; of vector adds, and one final addp. Both RUN configurations share this
; output.
1786 define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
1787 ; CHECK-LABEL: add_pair_v16i8_v16i64_sext:
1788 ; CHECK: // %bb.0: // %entry
1789 ; CHECK-NEXT: sshll2 v2.8h, v0.16b, #0
1790 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1791 ; CHECK-NEXT: sshll2 v3.8h, v1.16b, #0
1792 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
1793 ; CHECK-NEXT: sshll v4.4s, v2.4h, #0
1794 ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
1795 ; CHECK-NEXT: sshll2 v5.4s, v0.8h, #0
1796 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
1797 ; CHECK-NEXT: sshll2 v6.4s, v3.8h, #0
1798 ; CHECK-NEXT: sshll2 v7.4s, v1.8h, #0
1799 ; CHECK-NEXT: sshll v3.4s, v3.4h, #0
1800 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
1801 ; CHECK-NEXT: saddl2 v16.2d, v5.4s, v2.4s
1802 ; CHECK-NEXT: saddl v2.2d, v5.2s, v2.2s
1803 ; CHECK-NEXT: saddl2 v5.2d, v0.4s, v4.4s
1804 ; CHECK-NEXT: saddl v0.2d, v0.2s, v4.2s
1805 ; CHECK-NEXT: saddl2 v4.2d, v7.4s, v6.4s
1806 ; CHECK-NEXT: saddl v6.2d, v7.2s, v6.2s
1807 ; CHECK-NEXT: saddl2 v7.2d, v1.4s, v3.4s
1808 ; CHECK-NEXT: saddl v1.2d, v1.2s, v3.2s
1809 ; CHECK-NEXT: add v3.2d, v5.2d, v16.2d
1810 ; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
1811 ; CHECK-NEXT: add v2.2d, v7.2d, v4.2d
1812 ; CHECK-NEXT: add v1.2d, v1.2d, v6.2d
1813 ; CHECK-NEXT: add v0.2d, v0.2d, v3.2d
1814 ; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
1815 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
1816 ; CHECK-NEXT: addp d0, v0.2d
1817 ; CHECK-NEXT: fmov x0, d0
1820 %xx = sext <16 x i8> %x to <16 x i64>
1821 %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
1822 %yy = sext <16 x i8> %y to <16 x i64>
1823 %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
1824 %z = add i64 %z1, %z2
; Pair test: zero-extends %x and %y from <8 x i8> to <8 x i64>, reduces each
; and adds the results. Expected asm widens both inputs with ushll/ushll2 (to
; .8h, then .4s), forms .2d partial sums with uaddl/uaddl2, adds them together
; and reduces with one addp.
1828 define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
1829 ; CHECK-LABEL: add_pair_v8i8_v8i64_zext:
1830 ; CHECK: // %bb.0: // %entry
1831 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1832 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
1833 ; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
1834 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1835 ; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0
1836 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1837 ; CHECK-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
1838 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s
1839 ; CHECK-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
1840 ; CHECK-NEXT: uaddl v1.2d, v1.2s, v3.2s
1841 ; CHECK-NEXT: add v0.2d, v0.2d, v4.2d
1842 ; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
1843 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
1844 ; CHECK-NEXT: addp d0, v0.2d
1845 ; CHECK-NEXT: fmov x0, d0
1848 %xx = zext <8 x i8> %x to <8 x i64>
1849 %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
1850 %yy = zext <8 x i8> %y to <8 x i64>
1851 %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
1852 %z = add i64 %z1, %z2
; Pair test: sign-extends %x and %y from <8 x i8> to <8 x i64>, reduces each
; and adds the results. Expected asm widens both inputs with sshll/sshll2 (to
; .8h, then .4s), forms .2d partial sums with saddl/saddl2, adds them together
; and reduces with one addp.
1856 define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
1857 ; CHECK-LABEL: add_pair_v8i8_v8i64_sext:
1858 ; CHECK: // %bb.0: // %entry
1859 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1860 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
1861 ; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0
1862 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
1863 ; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0
1864 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
1865 ; CHECK-NEXT: saddl2 v4.2d, v0.4s, v2.4s
1866 ; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s
1867 ; CHECK-NEXT: saddl2 v2.2d, v1.4s, v3.4s
1868 ; CHECK-NEXT: saddl v1.2d, v1.2s, v3.2s
1869 ; CHECK-NEXT: add v0.2d, v0.2d, v4.2d
1870 ; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
1871 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
1872 ; CHECK-NEXT: addp d0, v0.2d
1873 ; CHECK-NEXT: fmov x0, d0
1876 %xx = sext <8 x i8> %x to <8 x i64>
1877 %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
1878 %yy = sext <8 x i8> %y to <8 x i64>
1879 %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
1880 %z = add i64 %z1, %z2
; Pair test: zero-extends %x and %y from <4 x i8> to <4 x i64>, reduces each
; and adds the results. Expected asm clears the upper byte of every .4h lane
; (bic), widens to .4s with ushll, then uaddlp + uadalp and a final addp.
1884 define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
1885 ; CHECK-LABEL: add_pair_v4i8_v4i64_zext:
1886 ; CHECK: // %bb.0: // %entry
1887 ; CHECK-NEXT: bic v1.4h, #255, lsl #8
1888 ; CHECK-NEXT: bic v0.4h, #255, lsl #8
1889 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1890 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1891 ; CHECK-NEXT: uaddlp v1.2d, v1.4s
1892 ; CHECK-NEXT: uadalp v1.2d, v0.4s
1893 ; CHECK-NEXT: addp d0, v1.2d
1894 ; CHECK-NEXT: fmov x0, d0
1897 %xx = zext <4 x i8> %x to <4 x i64>
1898 %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1899 %yy = zext <4 x i8> %y to <4 x i64>
1900 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
1901 %z = add i64 %z1, %z2
; Pair test: sign-extends %x and %y from <4 x i8> to <4 x i64>, reduces each
; and adds the results. Expected asm widens each input to two .2d halves with
; ushll/ushll2, sign-extends in-register via shl #56 and sshr/ssra #56 (ssra
; folds the add of the two halves of each input), then adds the two inputs and
; reduces with addp.
1905 define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
1906 ; CHECK-LABEL: add_pair_v4i8_v4i64_sext:
1907 ; CHECK: // %bb.0: // %entry
1908 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
1909 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1910 ; CHECK-NEXT: ushll v2.2d, v0.2s, #0
1911 ; CHECK-NEXT: ushll v3.2d, v1.2s, #0
1912 ; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
1913 ; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0
1914 ; CHECK-NEXT: shl v2.2d, v2.2d, #56
1915 ; CHECK-NEXT: shl v3.2d, v3.2d, #56
1916 ; CHECK-NEXT: shl v0.2d, v0.2d, #56
1917 ; CHECK-NEXT: shl v1.2d, v1.2d, #56
1918 ; CHECK-NEXT: sshr v2.2d, v2.2d, #56
1919 ; CHECK-NEXT: sshr v3.2d, v3.2d, #56
1920 ; CHECK-NEXT: ssra v2.2d, v0.2d, #56
1921 ; CHECK-NEXT: ssra v3.2d, v1.2d, #56
1922 ; CHECK-NEXT: add v0.2d, v2.2d, v3.2d
1923 ; CHECK-NEXT: addp d0, v0.2d
1924 ; CHECK-NEXT: fmov x0, d0
1927 %xx = sext <4 x i8> %x to <4 x i64>
1928 %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1929 %yy = sext <4 x i8> %y to <4 x i64>
1930 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
1931 %z = add i64 %z1, %z2
; Pair test: zero-extends %x and %y from <2 x i8> to <2 x i64>, reduces each
; and adds the results. Expected asm masks each 32-bit lane to its low byte
; (shared movi mask + and), then one uaddl and addp.
1935 define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
1936 ; CHECK-LABEL: add_pair_v2i8_v2i64_zext:
1937 ; CHECK: // %bb.0: // %entry
1938 ; CHECK-NEXT: movi d2, #0x0000ff000000ff
1939 ; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
1940 ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
1941 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
1942 ; CHECK-NEXT: addp d0, v0.2d
1943 ; CHECK-NEXT: fmov x0, d0
1946 %xx = zext <2 x i8> %x to <2 x i64>
1947 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1948 %yy = zext <2 x i8> %y to <2 x i64>
1949 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
1950 %z = add i64 %z1, %z2
; Pair test: sign-extends %x and %y from <2 x i8> to <2 x i64>, reduces each
; and adds the results. Expected asm widens with ushll, sign-extends
; in-register via shl #56 and sshr/ssra #56 (ssra also adds the two inputs),
; then reduces with addp.
1954 define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
1955 ; CHECK-LABEL: add_pair_v2i8_v2i64_sext:
1956 ; CHECK: // %bb.0: // %entry
1957 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
1958 ; CHECK-NEXT: ushll v1.2d, v1.2s, #0
1959 ; CHECK-NEXT: shl v0.2d, v0.2d, #56
1960 ; CHECK-NEXT: shl v1.2d, v1.2d, #56
1961 ; CHECK-NEXT: sshr v0.2d, v0.2d, #56
1962 ; CHECK-NEXT: ssra v0.2d, v1.2d, #56
1963 ; CHECK-NEXT: addp d0, v0.2d
1964 ; CHECK-NEXT: fmov x0, d0
1967 %xx = sext <2 x i8> %x to <2 x i64>
1968 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1969 %yy = sext <2 x i8> %y to <2 x i64>
1970 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
1971 %z = add i64 %z1, %z2
; Two pair-reductions of mixed signedness added together: %ax and %ay are
; zero-extended to <8 x i32>, %bx and %by are sign-extended, each is reduced,
; and all four scalars are summed.
;  - without +dotprod the expected asm keeps an unsigned chain
;    (ushll, uaddlp, uadalp) and a signed chain (sshll, saddlp, sadalp), adds
;    the two .4s accumulators and reduces with addv;
;  - with +dotprod it uses two zeroed .2s accumulators, udot for the unsigned
;    pair and sdot for the signed pair (both against byte 1s), adds them and
;    reduces with addp.
1975 define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
1976 ; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
1977 ; CHECK-BASE: // %bb.0: // %entry
1978 ; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
1979 ; CHECK-BASE-NEXT: sshll v3.8h, v3.8b, #0
1980 ; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
1981 ; CHECK-BASE-NEXT: sshll v2.8h, v2.8b, #0
1982 ; CHECK-BASE-NEXT: uaddlp v1.4s, v1.8h
1983 ; CHECK-BASE-NEXT: saddlp v3.4s, v3.8h
1984 ; CHECK-BASE-NEXT: uadalp v1.4s, v0.8h
1985 ; CHECK-BASE-NEXT: sadalp v3.4s, v2.8h
1986 ; CHECK-BASE-NEXT: add v0.4s, v3.4s, v1.4s
1987 ; CHECK-BASE-NEXT: addv s0, v0.4s
1988 ; CHECK-BASE-NEXT: fmov w0, s0
1989 ; CHECK-BASE-NEXT: ret
1991 ; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
1992 ; CHECK-DOT: // %bb.0: // %entry
1993 ; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
1994 ; CHECK-DOT-NEXT: movi v5.8b, #1
1995 ; CHECK-DOT-NEXT: movi v6.2d, #0000000000000000
1996 ; CHECK-DOT-NEXT: udot v6.2s, v1.8b, v5.8b
1997 ; CHECK-DOT-NEXT: sdot v4.2s, v3.8b, v5.8b
1998 ; CHECK-DOT-NEXT: udot v6.2s, v0.8b, v5.8b
1999 ; CHECK-DOT-NEXT: sdot v4.2s, v2.8b, v5.8b
2000 ; CHECK-DOT-NEXT: add v0.2s, v6.2s, v4.2s
2001 ; CHECK-DOT-NEXT: addp v0.2s, v0.2s, v0.2s
2002 ; CHECK-DOT-NEXT: fmov w0, s0
2003 ; CHECK-DOT-NEXT: ret
2005 %axx = zext <8 x i8> %ax to <8 x i32>
2006 %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
2007 %ayy = zext <8 x i8> %ay to <8 x i32>
2008 %az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy)
2009 %az = add i32 %az1, %az2
2010 %bxx = sext <8 x i8> %bx to <8 x i32>
2011 %bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx)
2012 %byy = sext <8 x i8> %by to <8 x i32>
2013 %bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy)
2014 %bz = add i32 %bz1, %bz2
2015 %z = add i32 %az, %bz
; Hand-expanded variant of the double pair test: each <8 x i16> input is
; extended to <8 x i32>, split into two <4 x i32> halves with shufflevector,
; and the halves are added before a single v4i32 reduction at the end.
; Notes for readers:
;  - despite "sext_zext" in the name, all four inputs are zero-extended in this
;    IR, and the expected asm is correspondingly all-unsigned (uaddlp/uadalp);
;  - the %sNh values select lanes 0-3 and the %sNl values select lanes 4-7, so
;    the h/l suffixes are the reverse of the lanes they pick; this has no
;    effect on the sum.
; NOTE(review): the shuffles use `undef` as the unused second operand; newer
; LLVM tests tend to use `poison` - confirm against upstream before changing.
2019 define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
2020 ; CHECK-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
2021 ; CHECK: // %bb.0: // %entry
2022 ; CHECK-NEXT: uaddlp v1.4s, v1.8h
2023 ; CHECK-NEXT: uaddlp v3.4s, v3.8h
2024 ; CHECK-NEXT: uadalp v1.4s, v0.8h
2025 ; CHECK-NEXT: uadalp v3.4s, v2.8h
2026 ; CHECK-NEXT: add v0.4s, v3.4s, v1.4s
2027 ; CHECK-NEXT: addv s0, v0.4s
2028 ; CHECK-NEXT: fmov w0, s0
2031 %axx = zext <8 x i16> %ax to <8 x i32>
2032 %s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2033 %s1l = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2034 %axs = add <4 x i32> %s1h, %s1l
2035 %ayy = zext <8 x i16> %ay to <8 x i32>
2036 %s2h = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2037 %s2l = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2038 %ays = add <4 x i32> %s2h, %s2l
2039 %az = add <4 x i32> %axs, %ays
2040 %bxx = zext <8 x i16> %bx to <8 x i32>
2041 %s3h = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2042 %s3l = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2043 %bxs = add <4 x i32> %s3h, %s3l
2044 %byy = zext <8 x i16> %by to <8 x i32>
2045 %s4h = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2046 %s4l = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2047 %bys = add <4 x i32> %s4h, %s4l
2048 %bz = add <4 x i32> %bxs, %bys
2049 %z = add <4 x i32> %az, %bz
2050 %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z)
; Reduces two <2 x i64> vectors separately with llvm.vector.reduce.add.v2i64
; and adds the two scalar results.  The checks expect the two reductions to be
; merged into one vector add followed by a single addp.
2054 define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
2055 ; CHECK-LABEL: add_pair_v2i64_v2i64:
2056 ; CHECK: // %bb.0: // %entry
2057 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
2058 ; CHECK-NEXT: addp d0, v0.2d
2059 ; CHECK-NEXT: fmov x0, d0
2062 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
2063 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
2064 %z = add i64 %z1, %z2
; Eight-row sum of absolute differences, fully unrolled.  For each row the
; body loads <8 x i8> from the current %p1 and %p2 addresses, zero-extends
; both to <8 x i32>, takes llvm.abs of their difference, reduces it with
; llvm.vector.reduce.add.v8i32 and adds the result to a running total
; (%op.rdx.N).  Between rows the %p1 address advances by sext(%s1) bytes and
; the %p2 address by sext(%s2) bytes.  %op.rdx.7 is the final total.
; The two llc configurations produce different code, so the checks are split
; by prefix: without +dotprod (BASE prefix) each row is expected to become
; uabdl plus uaddlp/uadalp accumulation with a final addv; with +dotprod (DOT
; prefix) each row becomes uabd plus udot against a vector whose bytes are all
; 1, with a final addp.
2068 define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
2069 ; CHECK-BASE-LABEL: full:
2070 ; CHECK-BASE: // %bb.0: // %entry
2071 ; CHECK-BASE-NEXT: ldr d0, [x2]
2072 ; CHECK-BASE-NEXT: ldr d1, [x0]
2073 ; CHECK-BASE-NEXT: // kill: def $w3 killed $w3 def $x3
2074 ; CHECK-BASE-NEXT: // kill: def $w1 killed $w1 def $x1
2075 ; CHECK-BASE-NEXT: sxtw x8, w3
2076 ; CHECK-BASE-NEXT: sxtw x9, w1
2077 ; CHECK-BASE-NEXT: uabdl v0.8h, v1.8b, v0.8b
2078 ; CHECK-BASE-NEXT: add x11, x2, x8
2079 ; CHECK-BASE-NEXT: add x10, x0, x9
2080 ; CHECK-BASE-NEXT: ldr d2, [x11]
2081 ; CHECK-BASE-NEXT: add x11, x11, x8
2082 ; CHECK-BASE-NEXT: ldr d1, [x10]
2083 ; CHECK-BASE-NEXT: add x10, x10, x9
2084 ; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
2085 ; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
2086 ; CHECK-BASE-NEXT: ldr d2, [x11]
2087 ; CHECK-BASE-NEXT: add x11, x11, x8
2088 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
2089 ; CHECK-BASE-NEXT: ldr d1, [x10]
2090 ; CHECK-BASE-NEXT: add x10, x10, x9
2091 ; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
2092 ; CHECK-BASE-NEXT: ldr d2, [x11]
2093 ; CHECK-BASE-NEXT: add x11, x11, x8
2094 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
2095 ; CHECK-BASE-NEXT: ldr d1, [x10]
2096 ; CHECK-BASE-NEXT: add x10, x10, x9
2097 ; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
2098 ; CHECK-BASE-NEXT: ldr d2, [x11]
2099 ; CHECK-BASE-NEXT: add x11, x11, x8
2100 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
2101 ; CHECK-BASE-NEXT: ldr d1, [x10]
2102 ; CHECK-BASE-NEXT: add x10, x10, x9
2103 ; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
2104 ; CHECK-BASE-NEXT: ldr d2, [x11]
2105 ; CHECK-BASE-NEXT: add x11, x11, x8
2106 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
2107 ; CHECK-BASE-NEXT: ldr d1, [x10]
2108 ; CHECK-BASE-NEXT: add x10, x10, x9
2109 ; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
2110 ; CHECK-BASE-NEXT: ldr d2, [x11]
2111 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
2112 ; CHECK-BASE-NEXT: ldr d1, [x10]
2113 ; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
2114 ; CHECK-BASE-NEXT: ldr d2, [x11, x8]
2115 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
2116 ; CHECK-BASE-NEXT: ldr d1, [x10, x9]
2117 ; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
2118 ; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
2119 ; CHECK-BASE-NEXT: addv s0, v0.4s
2120 ; CHECK-BASE-NEXT: fmov w0, s0
2121 ; CHECK-BASE-NEXT: ret
2123 ; CHECK-DOT-LABEL: full:
2124 ; CHECK-DOT: // %bb.0: // %entry
2125 ; CHECK-DOT-NEXT: ldr d0, [x0]
2126 ; CHECK-DOT-NEXT: ldr d1, [x2]
2127 ; CHECK-DOT-NEXT: // kill: def $w3 killed $w3 def $x3
2128 ; CHECK-DOT-NEXT: // kill: def $w1 killed $w1 def $x1
2129 ; CHECK-DOT-NEXT: sxtw x8, w3
2130 ; CHECK-DOT-NEXT: sxtw x9, w1
2131 ; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
2132 ; CHECK-DOT-NEXT: movi v3.8b, #1
2133 ; CHECK-DOT-NEXT: uabd v0.8b, v0.8b, v1.8b
2134 ; CHECK-DOT-NEXT: add x11, x2, x8
2135 ; CHECK-DOT-NEXT: add x10, x0, x9
2136 ; CHECK-DOT-NEXT: ldr d4, [x11]
2137 ; CHECK-DOT-NEXT: add x11, x11, x8
2138 ; CHECK-DOT-NEXT: ldr d1, [x10]
2139 ; CHECK-DOT-NEXT: add x10, x10, x9
2140 ; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
2141 ; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
2142 ; CHECK-DOT-NEXT: ldr d1, [x10]
2143 ; CHECK-DOT-NEXT: ldr d4, [x11]
2144 ; CHECK-DOT-NEXT: add x10, x10, x9
2145 ; CHECK-DOT-NEXT: add x11, x11, x8
2146 ; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
2147 ; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
2148 ; CHECK-DOT-NEXT: ldr d1, [x10]
2149 ; CHECK-DOT-NEXT: ldr d4, [x11]
2150 ; CHECK-DOT-NEXT: add x10, x10, x9
2151 ; CHECK-DOT-NEXT: add x11, x11, x8
2152 ; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
2153 ; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
2154 ; CHECK-DOT-NEXT: ldr d1, [x10]
2155 ; CHECK-DOT-NEXT: ldr d4, [x11]
2156 ; CHECK-DOT-NEXT: add x10, x10, x9
2157 ; CHECK-DOT-NEXT: add x11, x11, x8
2158 ; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
2159 ; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
2160 ; CHECK-DOT-NEXT: ldr d1, [x10]
2161 ; CHECK-DOT-NEXT: ldr d4, [x11]
2162 ; CHECK-DOT-NEXT: add x10, x10, x9
2163 ; CHECK-DOT-NEXT: add x11, x11, x8
2164 ; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
2165 ; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
2166 ; CHECK-DOT-NEXT: ldr d1, [x10]
2167 ; CHECK-DOT-NEXT: ldr d4, [x11]
2168 ; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
2169 ; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
2170 ; CHECK-DOT-NEXT: ldr d1, [x10, x9]
2171 ; CHECK-DOT-NEXT: ldr d4, [x11, x8]
2172 ; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
2173 ; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
2174 ; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
2175 ; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
2176 ; CHECK-DOT-NEXT: fmov w0, s0
2177 ; CHECK-DOT-NEXT: ret
; Byte strides for the two pointers, sign-extended to i64.
2179 %idx.ext8 = sext i32 %s2 to i64
2180 %idx.ext = sext i32 %s1 to i64
; Row 0: loaded directly from %p1 / %p2.
2181 %0 = load <8 x i8>, ptr %p1, align 1
2182 %1 = zext <8 x i8> %0 to <8 x i32>
2183 %2 = load <8 x i8>, ptr %p2, align 1
2184 %3 = zext <8 x i8> %2 to <8 x i32>
2185 %4 = sub nsw <8 x i32> %1, %3
2186 %5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true)
2187 %6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
; Row 1: advance both pointers by their strides, then repeat the row pattern.
2188 %add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
2189 %add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8
2190 %7 = load <8 x i8>, ptr %add.ptr, align 1
2191 %8 = zext <8 x i8> %7 to <8 x i32>
2192 %9 = load <8 x i8>, ptr %add.ptr9, align 1
2193 %10 = zext <8 x i8> %9 to <8 x i32>
2194 %11 = sub nsw <8 x i32> %8, %10
2195 %12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true)
2196 %13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
2197 %op.rdx.1 = add i32 %13, %6
; Row 2.
2198 %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
2199 %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
2200 %14 = load <8 x i8>, ptr %add.ptr.1, align 1
2201 %15 = zext <8 x i8> %14 to <8 x i32>
2202 %16 = load <8 x i8>, ptr %add.ptr9.1, align 1
2203 %17 = zext <8 x i8> %16 to <8 x i32>
2204 %18 = sub nsw <8 x i32> %15, %17
2205 %19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true)
2206 %20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
2207 %op.rdx.2 = add i32 %20, %op.rdx.1
; Row 3.
2208 %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
2209 %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
2210 %21 = load <8 x i8>, ptr %add.ptr.2, align 1
2211 %22 = zext <8 x i8> %21 to <8 x i32>
2212 %23 = load <8 x i8>, ptr %add.ptr9.2, align 1
2213 %24 = zext <8 x i8> %23 to <8 x i32>
2214 %25 = sub nsw <8 x i32> %22, %24
2215 %26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true)
2216 %27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26)
2217 %op.rdx.3 = add i32 %27, %op.rdx.2
; Row 4.
2218 %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext
2219 %add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8
2220 %28 = load <8 x i8>, ptr %add.ptr.3, align 1
2221 %29 = zext <8 x i8> %28 to <8 x i32>
2222 %30 = load <8 x i8>, ptr %add.ptr9.3, align 1
2223 %31 = zext <8 x i8> %30 to <8 x i32>
2224 %32 = sub nsw <8 x i32> %29, %31
2225 %33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true)
2226 %34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33)
2227 %op.rdx.4 = add i32 %34, %op.rdx.3
; Row 5.
2228 %add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext
2229 %add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8
2230 %35 = load <8 x i8>, ptr %add.ptr.4, align 1
2231 %36 = zext <8 x i8> %35 to <8 x i32>
2232 %37 = load <8 x i8>, ptr %add.ptr9.4, align 1
2233 %38 = zext <8 x i8> %37 to <8 x i32>
2234 %39 = sub nsw <8 x i32> %36, %38
2235 %40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true)
2236 %41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
2237 %op.rdx.5 = add i32 %41, %op.rdx.4
; Row 6.
2238 %add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext
2239 %add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8
2240 %42 = load <8 x i8>, ptr %add.ptr.5, align 1
2241 %43 = zext <8 x i8> %42 to <8 x i32>
2242 %44 = load <8 x i8>, ptr %add.ptr9.5, align 1
2243 %45 = zext <8 x i8> %44 to <8 x i32>
2244 %46 = sub nsw <8 x i32> %43, %45
2245 %47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true)
2246 %48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
2247 %op.rdx.6 = add i32 %48, %op.rdx.5
; Row 7: last row; %op.rdx.7 holds the total over all eight rows.
2248 %add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext
2249 %add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8
2250 %49 = load <8 x i8>, ptr %add.ptr.6, align 1
2251 %50 = zext <8 x i8> %49 to <8 x i32>
2252 %51 = load <8 x i8>, ptr %add.ptr9.6, align 1
2253 %52 = zext <8 x i8> %51 to <8 x i32>
2254 %53 = sub nsw <8 x i32> %50, %52
2255 %54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true)
2256 %55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54)
2257 %op.rdx.7 = add i32 %55, %op.rdx.6
; Declarations for the llvm.abs and llvm.vector.reduce.add intrinsic overloads;
; presumably each one is called by a test function in this file.
; NOTE(review): the llvm.abs.v8i32 declaration references attribute group #1 --
; confirm that a matching "attributes #1" definition exists in the file.
2261 declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
2262 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
2263 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
2264 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
2265 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
2266 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
2267 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
2268 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
2269 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
2270 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
2271 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)