1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; Signed halving add: shadd Vd = (Va + Vb) >> 1, via the NEON intrinsic.
define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: shadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: shadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: shadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: shadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: shadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: shadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
; Unsigned halving add: uhadd Vd = (Va + Vb) >> 1, via the NEON intrinsic.
define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: uhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: uhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: uhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: uhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: uhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: uhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
; Intrinsic declarations for the shadd/uhadd tests above.
declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
; Signed rounding halving add: srhadd Vd = (Va + Vb + 1) >> 1, via the intrinsic.
define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: srhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: srhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: srhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: srhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: srhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: srhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
; Unsigned rounding halving add: urhadd Vd = (Va + Vb + 1) >> 1, via the intrinsic.
define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: urhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: urhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: urhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: urhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: urhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: urhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
; Check that the generic sext/add/+1/lshr/trunc pattern is lowered to srhadd.
define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
  %add1 = add <8 x i16> %sextsrc1, %sextsrc2
  %add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, <8 x i8>* %dest, align 8
  ret void
}

define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add1 = add <4 x i32> %sextsrc1, %sextsrc2
  %add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, <4 x i16>* %dest, align 8
  ret void
}

define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
  %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
  %add1 = add <2 x i64> %sextsrc1, %sextsrc2
  %add2 = add <2 x i64> %add1, <i64 1, i64 1>
  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, <2 x i32>* %dest, align 8
  ret void
}

define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add1 = add <16 x i16> %sextsrc1, %sextsrc2
  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, <16 x i8>* %dest, align 16
  ret void
}

define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %sextsrc1, %sextsrc2
  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, <8 x i16>* %dest, align 16
  ret void
}

define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSRHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add1 = add <4 x i64> %sextsrc1, %sextsrc2
  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, <4 x i32>* %dest, align 16
  ret void
}
; Check that the generic sext/add/lshr/trunc pattern is lowered to shadd.
define void @testLowerToSHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
  %add = add <8 x i16> %sextsrc1, %sextsrc2
  %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, <8 x i8>* %dest, align 8
  ret void
}

define void @testLowerToSHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add = add <4 x i32> %sextsrc1, %sextsrc2
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, <4 x i16>* %dest, align 8
  ret void
}

define void @testLowerToSHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
  %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
  %add = add <2 x i64> %sextsrc1, %sextsrc2
  %resulti16 = lshr <2 x i64> %add, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, <2 x i32>* %dest, align 8
  ret void
}

define void @testLowerToSHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToSHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add = add <16 x i16> %sextsrc1, %sextsrc2
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, <16 x i8>* %dest, align 16
  ret void
}

define void @testLowerToSHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToSHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add = add <8 x i32> %sextsrc1, %sextsrc2
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, <8 x i16>* %dest, align 16
  ret void
}

define void @testLowerToSHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToSHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add = add <4 x i64> %sextsrc1, %sextsrc2
  %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, <4 x i32>* %dest, align 16
  ret void
}
; Check that the generic zext/add/+1/lshr/trunc pattern is lowered to urhadd.
define void @testLowerToURHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
  %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
  %add1 = add <8 x i16> %zextsrc1, %zextsrc2
  %add2 = add <8 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, <8 x i8>* %dest, align 8
  ret void
}

define void @testLowerToURHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add1 = add <4 x i32> %zextsrc1, %zextsrc2
  %add2 = add <4 x i32> %add1, <i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, <4 x i16>* %dest, align 8
  ret void
}

define void @testLowerToURHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
  %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
  %add1 = add <2 x i64> %zextsrc1, %zextsrc2
  %add2 = add <2 x i64> %add1, <i64 1, i64 1>
  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, <2 x i32>* %dest, align 8
  ret void
}

define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add1 = add <16 x i16> %zextsrc1, %zextsrc2
  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, <16 x i8>* %dest, align 16
  ret void
}

define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %zextsrc1, %zextsrc2
  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, <8 x i16>* %dest, align 16
  ret void
}

define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToURHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add1 = add <4 x i64> %zextsrc1, %zextsrc2
  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, <4 x i32>* %dest, align 16
  ret void
}
; Check that the generic zext/add/lshr/trunc pattern is lowered to uhadd.
define void @testLowerToUHADD8b(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToUHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
  %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
  %add = add <8 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, <8 x i8>* %dest, align 8
  ret void
}

define void @testLowerToUHADD4h(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToUHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add = add <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, <4 x i16>* %dest, align 8
  ret void
}

define void @testLowerToUHADD2s(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToUHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
  %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
  %add = add <2 x i64> %zextsrc1, %zextsrc2
  %resulti16 = lshr <2 x i64> %add, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, <2 x i32>* %dest, align 8
  ret void
}

define void @testLowerToUHADD16b(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLowerToUHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add = add <16 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, <16 x i8>* %dest, align 16
  ret void
}

define void @testLowerToUHADD8h(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
; CHECK-LABEL: testLowerToUHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add = add <8 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, <8 x i16>* %dest, align 16
  ret void
}

define void @testLowerToUHADD4s(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: testLowerToUHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add = add <4 x i64> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, <4 x i32>* %dest, align 16
  ret void
}
; Negative-ish tests: the result is kept at the wide (i32) type, so no halving
; add can be formed; expect a widening add (saddl/uaddl) followed by a shift.
define <4 x i32> @hadd16_sext_asr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
; CHECK-LABEL: hadd16_sext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    saddl.4s v0, v0, v1
; CHECK-NEXT:    sshr.4s v0, v0, #1
; CHECK-NEXT:    ret
  %zextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add = add <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %resulti16
}

define <4 x i32> @hadd16_zext_asr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
; CHECK-LABEL: hadd16_zext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uaddl.4s v0, v0, v1
; CHECK-NEXT:    ushr.4s v0, v0, #1
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add = add <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %resulti16
}

define <4 x i32> @hadd16_sext_lsr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
; CHECK-LABEL: hadd16_sext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    saddl.4s v0, v0, v1
; CHECK-NEXT:    ushr.4s v0, v0, #1
; CHECK-NEXT:    ret
  %zextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add = add <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %resulti16
}

define <4 x i32> @hadd16_zext_lsr(<4 x i16> %src1, <4 x i16> %src2) nounwind {
; CHECK-LABEL: hadd16_zext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uaddl.4s v0, v0, v1
; CHECK-NEXT:    ushr.4s v0, v0, #1
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add = add <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %resulti16
}
; i32 -> i64 variants: the <4 x i64> result spans two 2d registers, so the
; widening add and the shift are each split into low/high halves.
define <4 x i64> @hadd32_sext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
; CHECK-LABEL: hadd32_sext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    saddl.2d v2, v0, v1
; CHECK-NEXT:    saddl2.2d v0, v0, v1
; CHECK-NEXT:    sshr.2d v1, v0, #1
; CHECK-NEXT:    sshr.2d v0, v2, #1
; CHECK-NEXT:    ret
  %zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add = add <4 x i64> %zextsrc1, %zextsrc2
  %resulti32 = ashr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %resulti32
}

define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
; CHECK-LABEL: hadd32_zext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uaddl.2d v2, v0, v1
; CHECK-NEXT:    uaddl2.2d v0, v0, v1
; CHECK-NEXT:    ushr.2d v1, v0, #1
; CHECK-NEXT:    ushr.2d v0, v2, #1
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add = add <4 x i64> %zextsrc1, %zextsrc2
  %resulti32 = ashr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %resulti32
}

define <4 x i64> @hadd32_sext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
; CHECK-LABEL: hadd32_sext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    saddl.2d v2, v0, v1
; CHECK-NEXT:    saddl2.2d v0, v0, v1
; CHECK-NEXT:    ushr.2d v1, v0, #1
; CHECK-NEXT:    ushr.2d v0, v2, #1
; CHECK-NEXT:    ret
  %zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add = add <4 x i64> %zextsrc1, %zextsrc2
  %resulti32 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %resulti32
}

define <4 x i64> @hadd32_zext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
; CHECK-LABEL: hadd32_zext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uaddl.2d v2, v0, v1
; CHECK-NEXT:    uaddl2.2d v0, v0, v1
; CHECK-NEXT:    ushr.2d v1, v0, #1
; CHECK-NEXT:    ushr.2d v0, v2, #1
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add = add <4 x i64> %zextsrc1, %zextsrc2
  %resulti32 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %resulti32
}
; Intrinsic declarations for the srhadd/urhadd tests above.
declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone