; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - -lower-interleaved-accesses=false | FileCheck %s
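
; VPADD performs a pairwise add within 64-bit d registers: for
; vpadd.i8 d0, d1, d2, lanes 0-3 of d0 hold d1[0]+d1[1] ... d1[6]+d1[7],
; and lanes 4-7 hold the corresponding pair sums of d2. The tests below
; exercise the intrinsic directly for each element type.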

define <8 x i8> @vpaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vpaddi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vpadd.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <4 x i16> @vpaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vpaddi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vpadd.i16 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @vpaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vpaddi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vpadd.i32 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <2 x float> @vpaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vpaddf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vpadd.f32 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
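
; VPADDL ("pairwise add long") adds adjacent element pairs and widens each
; sum to twice the element width, so a d register of eight i8 lanes yields
; four i16 lanes: d0.i16[i] = extend(d1.i8[2i]) + extend(d1.i8[2i+1]). The
; .s and .u suffixes select sign- versus zero-extension of the inputs.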

define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddls8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.s8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp2
}

define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddls16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.s16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp2
}

define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddls32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.s32 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp2
}

define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddlu8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.u8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp2
}

define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddlu16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.u16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp2
}

define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddlu32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.u32 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp2
}
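
; The q-register forms below do the same pairwise widening across a full
; 128-bit register. The <8 x i16>/<4 x i32>/<2 x i64> results are returned
; in r0-r3, hence the two vmov instructions in each check block.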

define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddlQs8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s8 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp2
}

define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddlQs16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp2
}

define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddlQs32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s32 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp2
}

define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddlQu8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u8 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp2
}

define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddlQu16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp2
}

define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddlQu32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u32 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp2
}
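
; The remaining tests check DAG combines rather than intrinsics. Extracting
; the even lanes and the odd lanes of a vector (a vuzp/vtrn shuffle pattern)
; and adding the two halves is exactly a pairwise add, e.g. for <4 x i32>
;   even = <x0, x2>, odd = <x1, x3>, even + odd = <x0+x1, x2+x3>,
; so the backend folds the shuffle+add into a single vpadd, or into vpaddl
; when the add operands are first sign- or zero-extended.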

; Combine vuzp+vadd->vpadd.
define void @addCombineToVPADD_i8(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADD_i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpadd.i8 d16, d16, d17
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>

  %add = add <8 x i8> %tmp3, %tmp1
  store <8 x i8> %add, <8 x i8>* %X, align 8
  ret void
}

; Combine vuzp+vadd->vpadd.
define void @addCombineToVPADD_i16(<8 x i16> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADD_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpadd.i16 d16, d16, d17
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <8 x i16>, <8 x i16>* %cbcr
  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = add <4 x i16> %tmp3, %tmp1
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}

; Combine vtrn+vadd->vpadd.
define void @addCombineToVPADD_i32(<4 x i32> *%cbcr, <2 x i32> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADD_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpadd.i32 d16, d16, d17
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <4 x i32>, <4 x i32>* %cbcr
  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %add = add <2 x i32> %tmp3, %tmp1
  store <2 x i32> %add, <2 x i32>* %X, align 8
  ret void
}

; Combine vuzp+vaddl->vpaddl
define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_s8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s8 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
  %add = add <8 x i16> %tmp4, %tmp5
  store <8 x i16> %add, <8 x i16>* %X, align 8
  ret void
}

; Combine vuzp+vaddl->vpaddl
; FIXME: Legalization butchers the shuffles.
define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_s8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov.i16 d16, #0x8
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vext.8 d17, d18, d16, #1
; CHECK-NEXT:    vneg.s16 d16, d16
; CHECK-NEXT:    vshl.i16 d18, d18, #8
; CHECK-NEXT:    vshl.i16 d17, d17, #8
; CHECK-NEXT:    vshl.s16 d18, d18, d16
; CHECK-NEXT:    vshl.s16 d16, d17, d16
; CHECK-NEXT:    vadd.i16 d16, d16, d18
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = sext <4 x i8> %tmp3 to <4 x i16>
  %tmp5 = sext <4 x i8> %tmp1 to <4 x i16>
  %add = add <4 x i16> %tmp4, %tmp5
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}
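
; The FIXME above: <4 x i8> is not a legal NEON type, so type legalization
; promotes the shuffled values to <4 x i16> before the combine can match.
; Each lane's sign-extension is then lowered as a shift pair (the vshl.i16
; by #8 followed by the vshl.s16 by -8 in the checks), which hides the
; pairwise-add pattern.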

; Combine vuzp+vaddl->vpaddl
define void @addCombineToVPADDLq_u8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u8 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp1 to <8 x i16>
  %add = add <8 x i16> %tmp4, %tmp5
  store <8 x i16> %add, <8 x i16>* %X, align 8
  ret void
}

; In theory, it's possible to match this to vpaddl, but rearranging the
; shuffle is awkward, so this doesn't match at the moment.
define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u8_early_zext:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vmovl.u8 q9, d17
; CHECK-NEXT:    vmovl.u8 q8, d16
; CHECK-NEXT:    vuzp.16 q8, q9
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %tmp2, %tmp3
  store <8 x i16> %add, <8 x i16>* %X, align 8
  ret void
}
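
; With the early zext the even/odd extraction happens after the extend, on
; <16 x i16>, so the combiner sees a genuine two-register vuzp feeding the
; add (vmovl.u8 + vuzp.16 + vadd.i16 above) rather than a widening pairwise
; add of a single source vector.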

; Combine vuzp+vaddl->vpaddl
; FIXME: Legalization butchers the shuffle.
define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_u8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vext.8 d18, d16, d16, #1
; CHECK-NEXT:    vbic.i16 d16, #0xff00
; CHECK-NEXT:    vbic.i16 d18, #0xff00
; CHECK-NEXT:    vadd.i16 d16, d18, d16
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
  %tmp5 = zext <4 x i8> %tmp1 to <4 x i16>
  %add = add <4 x i16> %tmp4, %tmp5
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}
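
; Same legalization problem as the s8 case above, but for zero-extension
; the promoted lanes are masked with vbic.i16 #0xff00 (clearing the high
; byte of each halfword) rather than shifted.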

; Matching to vpaddl.8 requires matching shuffle(zext()).
define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_u8_early_zext:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vmovl.u8 q8, d16
; CHECK-NEXT:    vpadd.i16 d16, d16, d17
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = add <4 x i16> %tmp2, %tmp3
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}
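
; Here a combine does fire, but as vpadd.i16 on the two halves of the
; vmovl.u8 result: once the input is widened to <8 x i16>, adding its even
; and odd lanes is an ordinary pairwise add at i16.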

; Combine vuzp+vaddl->vpaddl
define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_s16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <8 x i16>, <8 x i16>* %cbcr
  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp1 to <4 x i32>
  %add = add <4 x i32> %tmp4, %tmp5
  store <4 x i32> %add, <4 x i32>* %X, align 8
  ret void
}

; Combine vuzp+vaddl->vpaddl
define void @addCombineToVPADDLq_u16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <8 x i16>, <8 x i16>* %cbcr
  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp1 to <4 x i32>
  %add = add <4 x i32> %tmp4, %tmp5
  store <4 x i32> %add, <4 x i32>* %X, align 8
  ret void
}

; Combine vtrn+vaddl->vpaddl
define void @addCombineToVPADDLq_s32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_s32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s32 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <4 x i32>, <4 x i32>* %cbcr
  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp4 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp1 to <2 x i64>
  %add = add <2 x i64> %tmp4, %tmp5
  store <2 x i64> %add, <2 x i64>* %X, align 8
  ret void
}

; Combine vtrn+vaddl->vpaddl
define void @addCombineToVPADDLq_u32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u32 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <4 x i32>, <4 x i32>* %cbcr
  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp1 to <2 x i64>
  %add = add <2 x i64> %tmp4, %tmp5
  store <2 x i64> %add, <2 x i64>* %X, align 8
  ret void
}

; Legalization promotes the <4 x i8> to <4 x i16>.
define <4 x i8> @fromExtendingExtractVectorElt_i8(<8 x i8> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vpaddl.s8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %x = add <4 x i8> %tmp2, %tmp1
  ret <4 x i8> %x
}
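
; Only the low 8 bits of each promoted lane are significant in the <4 x i8>
; result, so the backend can use the widening vpaddl.s8 even though the IR
; add is performed on <4 x i8>.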

; Legalization promotes the <2 x i16> to <2 x i32>.
define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vpaddl.s16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
  %tmp2 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
  %x = add <2 x i16> %tmp2, %tmp1
  ret <2 x i16> %x
}

; And <2 x i8> to <2 x i32>
define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_2i8:
; CHECK:       @ %bb.0:
  %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 0, i32 2>
  %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 1, i32 3>
  %x = add <2 x i8> %tmp2, %tmp1
  ret <2 x i8> %x
}

define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_2i16:
; CHECK:       @ %bb.0:
  %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
  %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 1, i32 3>
  %x = add <2 x i16> %tmp2, %tmp1
  ret <2 x i16> %x
}

declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone