1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,DAG %s
3 ; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=arm64-eabi -aarch64-neon-syntax=apple 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL,FALLBACK
5 ; FALLBACK-NOT: remark:{{.*}} sabdl8h
6 define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
7 ; CHECK-LABEL: sabdl8h:
9 ; CHECK-NEXT: ldr d0, [x0]
10 ; CHECK-NEXT: ldr d1, [x1]
11 ; CHECK-NEXT: sabdl.8h v0, v0, v1
13 %tmp1 = load <8 x i8>, <8 x i8>* %A
14 %tmp2 = load <8 x i8>, <8 x i8>* %B
15 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
16 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
20 ; FALLBACK-NOT: remark:{{.*}} sabdl4s
21 define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
22 ; CHECK-LABEL: sabdl4s:
24 ; CHECK-NEXT: ldr d0, [x0]
25 ; CHECK-NEXT: ldr d1, [x1]
26 ; CHECK-NEXT: sabdl.4s v0, v0, v1
28 %tmp1 = load <4 x i16>, <4 x i16>* %A
29 %tmp2 = load <4 x i16>, <4 x i16>* %B
30 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
31 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
35 ; FALLBACK-NOT: remark:{{.*}} sabdl2d
36 define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
37 ; CHECK-LABEL: sabdl2d:
39 ; CHECK-NEXT: ldr d0, [x0]
40 ; CHECK-NEXT: ldr d1, [x1]
41 ; CHECK-NEXT: sabdl.2d v0, v0, v1
43 %tmp1 = load <2 x i32>, <2 x i32>* %A
44 %tmp2 = load <2 x i32>, <2 x i32>* %B
45 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
46 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
50 define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
51 ; CHECK-LABEL: sabdl2_8h:
53 ; CHECK-NEXT: ldr d0, [x0, #8]
54 ; CHECK-NEXT: ldr d1, [x1, #8]
55 ; CHECK-NEXT: sabdl.8h v0, v0, v1
57 %load1 = load <16 x i8>, <16 x i8>* %A
58 %load2 = load <16 x i8>, <16 x i8>* %B
59 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
60 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
61 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
62 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
66 define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
67 ; CHECK-LABEL: sabdl2_4s:
69 ; CHECK-NEXT: ldr d0, [x0, #8]
70 ; CHECK-NEXT: ldr d1, [x1, #8]
71 ; CHECK-NEXT: sabdl.4s v0, v0, v1
73 %load1 = load <8 x i16>, <8 x i16>* %A
74 %load2 = load <8 x i16>, <8 x i16>* %B
75 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
76 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
77 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
78 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
82 define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
83 ; CHECK-LABEL: sabdl2_2d:
85 ; CHECK-NEXT: ldr d0, [x0, #8]
86 ; CHECK-NEXT: ldr d1, [x1, #8]
87 ; CHECK-NEXT: sabdl.2d v0, v0, v1
89 %load1 = load <4 x i32>, <4 x i32>* %A
90 %load2 = load <4 x i32>, <4 x i32>* %B
91 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
92 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
93 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
94 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
98 ; FALLBACK-NOT: remark:{{.*}} uabdl8h
99 define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
100 ; CHECK-LABEL: uabdl8h:
102 ; CHECK-NEXT: ldr d0, [x0]
103 ; CHECK-NEXT: ldr d1, [x1]
104 ; CHECK-NEXT: uabdl.8h v0, v0, v1
106 %tmp1 = load <8 x i8>, <8 x i8>* %A
107 %tmp2 = load <8 x i8>, <8 x i8>* %B
108 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
109 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
113 ; FALLBACK-NOT: remark:{{.*}} uabdl4s
114 define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
115 ; CHECK-LABEL: uabdl4s:
117 ; CHECK-NEXT: ldr d0, [x0]
118 ; CHECK-NEXT: ldr d1, [x1]
119 ; CHECK-NEXT: uabdl.4s v0, v0, v1
121 %tmp1 = load <4 x i16>, <4 x i16>* %A
122 %tmp2 = load <4 x i16>, <4 x i16>* %B
123 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
124 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
128 ; FALLBACK-NOT: remark:{{.*}} uabdl2d
129 define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
130 ; CHECK-LABEL: uabdl2d:
132 ; CHECK-NEXT: ldr d0, [x0]
133 ; CHECK-NEXT: ldr d1, [x1]
134 ; CHECK-NEXT: uabdl.2d v0, v0, v1
136 %tmp1 = load <2 x i32>, <2 x i32>* %A
137 %tmp2 = load <2 x i32>, <2 x i32>* %B
138 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
139 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
143 define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
144 ; CHECK-LABEL: uabdl2_8h:
146 ; CHECK-NEXT: ldr d0, [x0, #8]
147 ; CHECK-NEXT: ldr d1, [x1, #8]
148 ; CHECK-NEXT: uabdl.8h v0, v0, v1
150 %load1 = load <16 x i8>, <16 x i8>* %A
151 %load2 = load <16 x i8>, <16 x i8>* %B
152 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
153 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
155 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
156 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
160 define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
161 ; CHECK-LABEL: uabdl2_4s:
163 ; CHECK-NEXT: ldr d0, [x0, #8]
164 ; CHECK-NEXT: ldr d1, [x1, #8]
165 ; CHECK-NEXT: uabdl.4s v0, v0, v1
167 %load1 = load <8 x i16>, <8 x i16>* %A
168 %load2 = load <8 x i16>, <8 x i16>* %B
169 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
170 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
171 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
172 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
176 define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
177 ; CHECK-LABEL: uabdl2_2d:
179 ; CHECK-NEXT: ldr d0, [x0, #8]
180 ; CHECK-NEXT: ldr d1, [x1, #8]
181 ; CHECK-NEXT: uabdl.2d v0, v0, v1
183 %load1 = load <4 x i32>, <4 x i32>* %A
184 %load2 = load <4 x i32>, <4 x i32>* %B
185 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
186 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
187 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
188 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
192 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
193 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
195 define i16 @uabd16b_rdx(<16 x i8>* %a, <16 x i8>* %b) {
196 ; CHECK-LABEL: uabd16b_rdx:
198 ; CHECK-NEXT: ldr q0, [x0]
199 ; CHECK-NEXT: ldr q1, [x1]
200 ; CHECK-NEXT: uabd.16b v0, v0, v1
201 ; CHECK-NEXT: ushll.8h v1, v0, #0
202 ; CHECK-NEXT: uaddw2.8h v0, v1, v0
203 ; CHECK-NEXT: addv.8h h0, v0
204 ; CHECK-NEXT: fmov w0, s0
206 %aload = load <16 x i8>, <16 x i8>* %a, align 1
207 %bload = load <16 x i8>, <16 x i8>* %b, align 1
208 %aext = zext <16 x i8> %aload to <16 x i16>
209 %bext = zext <16 x i8> %bload to <16 x i16>
210 %abdiff = sub nsw <16 x i16> %aext, %bext
211 %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
212 %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
213 %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
214 %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)
218 define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
219 ; CHECK-LABEL: uabd16b_rdx_i32:
221 ; CHECK-NEXT: uabdl.8h v2, v0, v1
222 ; CHECK-NEXT: uabal2.8h v2, v0, v1
223 ; CHECK-NEXT: uaddlv.8h s0, v2
224 ; CHECK-NEXT: fmov w0, s0
226 %aext = zext <16 x i8> %a to <16 x i32>
227 %bext = zext <16 x i8> %b to <16 x i32>
228 %abdiff = sub nsw <16 x i32> %aext, %bext
229 %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
230 %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
231 %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
232 %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
236 define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
237 ; CHECK-LABEL: sabd16b_rdx_i32:
239 ; CHECK-NEXT: sabdl.8h v2, v0, v1
240 ; CHECK-NEXT: sabal2.8h v2, v0, v1
241 ; CHECK-NEXT: uaddlv.8h s0, v2
242 ; CHECK-NEXT: fmov w0, s0
244 %aext = sext <16 x i8> %a to <16 x i32>
245 %bext = sext <16 x i8> %b to <16 x i32>
246 %abdiff = sub nsw <16 x i32> %aext, %bext
247 %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
248 %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
249 %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
250 %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
255 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
256 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
258 define i32 @uabd8h_rdx(<8 x i16>* %a, <8 x i16>* %b) {
259 ; CHECK-LABEL: uabd8h_rdx:
261 ; CHECK-NEXT: ldr q0, [x0]
262 ; CHECK-NEXT: ldr q1, [x1]
263 ; CHECK-NEXT: uabd.8h v0, v0, v1
264 ; CHECK-NEXT: ushll.4s v1, v0, #0
265 ; CHECK-NEXT: uaddw2.4s v0, v1, v0
266 ; CHECK-NEXT: addv.4s s0, v0
267 ; CHECK-NEXT: fmov w0, s0
269 %aload = load <8 x i16>, <8 x i16>* %a, align 1
270 %bload = load <8 x i16>, <8 x i16>* %b, align 1
271 %aext = zext <8 x i16> %aload to <8 x i32>
272 %bext = zext <8 x i16> %bload to <8 x i32>
273 %abdiff = sub nsw <8 x i32> %aext, %bext
274 %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
275 %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
276 %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
277 %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
281 define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
282 ; CHECK-LABEL: sabd8h_rdx:
284 ; CHECK-NEXT: sabd.8h v0, v0, v1
285 ; CHECK-NEXT: ushll.4s v1, v0, #0
286 ; CHECK-NEXT: uaddw2.4s v0, v1, v0
287 ; CHECK-NEXT: addv.4s s0, v0
288 ; CHECK-NEXT: fmov w0, s0
290 %aext = sext <8 x i16> %a to <8 x i32>
291 %bext = sext <8 x i16> %b to <8 x i32>
292 %abdiff = sub nsw <8 x i32> %aext, %bext
293 %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
294 %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
295 %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
296 %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
300 define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
301 ; DAG-LABEL: uabdl4s_rdx_i32:
303 ; DAG-NEXT: uabdl.4s v0, v0, v1
304 ; DAG-NEXT: addv.4s s0, v0
305 ; DAG-NEXT: fmov w0, s0
308 ; GISEL-LABEL: uabdl4s_rdx_i32:
310 ; GISEL-NEXT: movi.2d v2, #0000000000000000
311 ; GISEL-NEXT: usubl.4s v0, v0, v1
312 ; GISEL-NEXT: cmgt.4s v1, v2, v0
313 ; GISEL-NEXT: shl.4s v1, v1, #31
314 ; GISEL-NEXT: neg.4s v2, v0
315 ; GISEL-NEXT: sshr.4s v1, v1, #31
316 ; GISEL-NEXT: bit.16b v0, v2, v1
317 ; GISEL-NEXT: addv.4s s0, v0
318 ; GISEL-NEXT: fmov w0, s0
321 ; GISel doesn't match this pattern yet.
322 %aext = zext <4 x i16> %a to <4 x i32>
323 %bext = zext <4 x i16> %b to <4 x i32>
324 %abdiff = sub nsw <4 x i32> %aext, %bext
325 %abcmp = icmp slt <4 x i32> %abdiff, zeroinitializer
326 %ababs = sub nsw <4 x i32> zeroinitializer, %abdiff
327 %absel = select <4 x i1> %abcmp, <4 x i32> %ababs, <4 x i32> %abdiff
328 %reduced_v = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %absel)
332 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
333 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
335 define i64 @uabd4s_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
336 ; CHECK-LABEL: uabd4s_rdx:
338 ; CHECK-NEXT: ldr q0, [x0]
339 ; CHECK-NEXT: ldr q1, [x1]
340 ; CHECK-NEXT: uabd.4s v0, v0, v1
341 ; CHECK-NEXT: ushll.2d v1, v0, #0
342 ; CHECK-NEXT: uaddw2.2d v0, v1, v0
343 ; CHECK-NEXT: addp.2d d0, v0
344 ; CHECK-NEXT: fmov x0, d0
346 %aload = load <4 x i32>, <4 x i32>* %a, align 1
347 %bload = load <4 x i32>, <4 x i32>* %b, align 1
348 %aext = zext <4 x i32> %aload to <4 x i64>
349 %bext = zext <4 x i32> %bload to <4 x i64>
350 %abdiff = sub nsw <4 x i64> %aext, %bext
351 %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
352 %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
353 %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
354 %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
358 define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) {
359 ; CHECK-LABEL: sabd4s_rdx:
361 ; CHECK-NEXT: sabd.4s v0, v0, v1
362 ; CHECK-NEXT: ushll.2d v1, v0, #0
363 ; CHECK-NEXT: uaddw2.2d v0, v1, v0
364 ; CHECK-NEXT: addp.2d d0, v0
365 ; CHECK-NEXT: fmov x0, d0
367 %aext = sext <4 x i32> %a to <4 x i64>
368 %bext = sext <4 x i32> %b to <4 x i64>
369 %abdiff = sub nsw <4 x i64> %aext, %bext
370 %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
371 %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
372 %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
373 %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
377 define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) {
378 ; DAG-LABEL: uabdl2d_rdx_i64:
380 ; DAG-NEXT: uabdl.2d v0, v0, v1
381 ; DAG-NEXT: addp.2d d0, v0
382 ; DAG-NEXT: fmov x0, d0
385 ; GISEL-LABEL: uabdl2d_rdx_i64:
387 ; GISEL-NEXT: movi.2d v2, #0000000000000000
388 ; GISEL-NEXT: usubl.2d v0, v0, v1
389 ; GISEL-NEXT: cmgt.2d v1, v2, v0
390 ; GISEL-NEXT: shl.2d v1, v1, #63
391 ; GISEL-NEXT: neg.2d v2, v0
392 ; GISEL-NEXT: sshr.2d v1, v1, #63
393 ; GISEL-NEXT: bit.16b v0, v2, v1
394 ; GISEL-NEXT: addp.2d d0, v0
395 ; GISEL-NEXT: fmov x0, d0
398 ; GISel doesn't match this pattern yet
399 %aext = zext <2 x i32> %a to <2 x i64>
400 %bext = zext <2 x i32> %b to <2 x i64>
401 %abdiff = sub nsw <2 x i64> %aext, %bext
402 %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
403 %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
404 %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
405 %reduced_v = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %absel)
409 define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
410 ; CHECK-LABEL: fabd_2s:
412 ; CHECK-NEXT: ldr d0, [x0]
413 ; CHECK-NEXT: ldr d1, [x1]
414 ; CHECK-NEXT: fabd.2s v0, v0, v1
416 %tmp1 = load <2 x float>, <2 x float>* %A
417 %tmp2 = load <2 x float>, <2 x float>* %B
418 %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
419 ret <2 x float> %tmp3
422 define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
423 ; CHECK-LABEL: fabd_4s:
425 ; CHECK-NEXT: ldr q0, [x0]
426 ; CHECK-NEXT: ldr q1, [x1]
427 ; CHECK-NEXT: fabd.4s v0, v0, v1
429 %tmp1 = load <4 x float>, <4 x float>* %A
430 %tmp2 = load <4 x float>, <4 x float>* %B
431 %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
432 ret <4 x float> %tmp3
435 define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
436 ; CHECK-LABEL: fabd_2d:
438 ; CHECK-NEXT: ldr q0, [x0]
439 ; CHECK-NEXT: ldr q1, [x1]
440 ; CHECK-NEXT: fabd.2d v0, v0, v1
442 %tmp1 = load <2 x double>, <2 x double>* %A
443 %tmp2 = load <2 x double>, <2 x double>* %B
444 %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
445 ret <2 x double> %tmp3
448 declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
449 declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
450 declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
452 define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
453 ; CHECK-LABEL: fabd_2s_from_fsub_fabs:
455 ; CHECK-NEXT: ldr d0, [x0]
456 ; CHECK-NEXT: ldr d1, [x1]
457 ; CHECK-NEXT: fabd.2s v0, v0, v1
459 %tmp1 = load <2 x float>, <2 x float>* %A
460 %tmp2 = load <2 x float>, <2 x float>* %B
461 %sub = fsub <2 x float> %tmp1, %tmp2
462 %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
466 define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
467 ; CHECK-LABEL: fabd_4s_from_fsub_fabs:
469 ; CHECK-NEXT: ldr q0, [x0]
470 ; CHECK-NEXT: ldr q1, [x1]
471 ; CHECK-NEXT: fabd.4s v0, v0, v1
473 %tmp1 = load <4 x float>, <4 x float>* %A
474 %tmp2 = load <4 x float>, <4 x float>* %B
475 %sub = fsub <4 x float> %tmp1, %tmp2
476 %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
480 define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
481 ; CHECK-LABEL: fabd_2d_from_fsub_fabs:
483 ; CHECK-NEXT: ldr q0, [x0]
484 ; CHECK-NEXT: ldr q1, [x1]
485 ; CHECK-NEXT: fabd.2d v0, v0, v1
487 %tmp1 = load <2 x double>, <2 x double>* %A
488 %tmp2 = load <2 x double>, <2 x double>* %B
489 %sub = fsub <2 x double> %tmp1, %tmp2
490 %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
491 ret <2 x double> %abs
494 declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
495 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
496 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
498 define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
499 ; CHECK-LABEL: sabd_8b:
501 ; CHECK-NEXT: ldr d0, [x0]
502 ; CHECK-NEXT: ldr d1, [x1]
503 ; CHECK-NEXT: sabd.8b v0, v0, v1
505 %tmp1 = load <8 x i8>, <8 x i8>* %A
506 %tmp2 = load <8 x i8>, <8 x i8>* %B
507 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
511 define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
512 ; CHECK-LABEL: sabd_16b:
514 ; CHECK-NEXT: ldr q0, [x0]
515 ; CHECK-NEXT: ldr q1, [x1]
516 ; CHECK-NEXT: sabd.16b v0, v0, v1
518 %tmp1 = load <16 x i8>, <16 x i8>* %A
519 %tmp2 = load <16 x i8>, <16 x i8>* %B
520 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
524 define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
525 ; CHECK-LABEL: sabd_4h:
527 ; CHECK-NEXT: ldr d0, [x0]
528 ; CHECK-NEXT: ldr d1, [x1]
529 ; CHECK-NEXT: sabd.4h v0, v0, v1
531 %tmp1 = load <4 x i16>, <4 x i16>* %A
532 %tmp2 = load <4 x i16>, <4 x i16>* %B
533 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
537 define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
538 ; CHECK-LABEL: sabd_8h:
540 ; CHECK-NEXT: ldr q0, [x0]
541 ; CHECK-NEXT: ldr q1, [x1]
542 ; CHECK-NEXT: sabd.8h v0, v0, v1
544 %tmp1 = load <8 x i16>, <8 x i16>* %A
545 %tmp2 = load <8 x i16>, <8 x i16>* %B
546 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
550 define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
551 ; CHECK-LABEL: sabd_2s:
553 ; CHECK-NEXT: ldr d0, [x0]
554 ; CHECK-NEXT: ldr d1, [x1]
555 ; CHECK-NEXT: sabd.2s v0, v0, v1
557 %tmp1 = load <2 x i32>, <2 x i32>* %A
558 %tmp2 = load <2 x i32>, <2 x i32>* %B
559 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
563 define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
564 ; CHECK-LABEL: sabd_4s:
566 ; CHECK-NEXT: ldr q0, [x0]
567 ; CHECK-NEXT: ldr q1, [x1]
568 ; CHECK-NEXT: sabd.4s v0, v0, v1
570 %tmp1 = load <4 x i32>, <4 x i32>* %A
571 %tmp2 = load <4 x i32>, <4 x i32>* %B
572 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
576 declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
577 declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
578 declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
579 declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
580 declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
581 declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
583 define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
584 ; CHECK-LABEL: uabd_8b:
586 ; CHECK-NEXT: ldr d0, [x0]
587 ; CHECK-NEXT: ldr d1, [x1]
588 ; CHECK-NEXT: uabd.8b v0, v0, v1
590 %tmp1 = load <8 x i8>, <8 x i8>* %A
591 %tmp2 = load <8 x i8>, <8 x i8>* %B
592 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
596 define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
597 ; CHECK-LABEL: uabd_16b:
599 ; CHECK-NEXT: ldr q0, [x0]
600 ; CHECK-NEXT: ldr q1, [x1]
601 ; CHECK-NEXT: uabd.16b v0, v0, v1
603 %tmp1 = load <16 x i8>, <16 x i8>* %A
604 %tmp2 = load <16 x i8>, <16 x i8>* %B
605 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
609 define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
610 ; CHECK-LABEL: uabd_4h:
612 ; CHECK-NEXT: ldr d0, [x0]
613 ; CHECK-NEXT: ldr d1, [x1]
614 ; CHECK-NEXT: uabd.4h v0, v0, v1
616 %tmp1 = load <4 x i16>, <4 x i16>* %A
617 %tmp2 = load <4 x i16>, <4 x i16>* %B
618 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
622 define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
623 ; CHECK-LABEL: uabd_8h:
625 ; CHECK-NEXT: ldr q0, [x0]
626 ; CHECK-NEXT: ldr q1, [x1]
627 ; CHECK-NEXT: uabd.8h v0, v0, v1
629 %tmp1 = load <8 x i16>, <8 x i16>* %A
630 %tmp2 = load <8 x i16>, <8 x i16>* %B
631 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
635 define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
636 ; CHECK-LABEL: uabd_2s:
638 ; CHECK-NEXT: ldr d0, [x0]
639 ; CHECK-NEXT: ldr d1, [x1]
640 ; CHECK-NEXT: uabd.2s v0, v0, v1
642 %tmp1 = load <2 x i32>, <2 x i32>* %A
643 %tmp2 = load <2 x i32>, <2 x i32>* %B
644 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
648 define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
649 ; CHECK-LABEL: uabd_4s:
651 ; CHECK-NEXT: ldr q0, [x0]
652 ; CHECK-NEXT: ldr q1, [x1]
653 ; CHECK-NEXT: uabd.4s v0, v0, v1
655 %tmp1 = load <4 x i32>, <4 x i32>* %A
656 %tmp2 = load <4 x i32>, <4 x i32>* %B
657 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
661 declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
662 declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
663 declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
664 declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
665 declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
666 declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
668 define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
669 ; CHECK-LABEL: sqabs_8b:
671 ; CHECK-NEXT: ldr d0, [x0]
672 ; CHECK-NEXT: sqabs.8b v0, v0
674 %tmp1 = load <8 x i8>, <8 x i8>* %A
675 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
679 define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
680 ; CHECK-LABEL: sqabs_16b:
682 ; CHECK-NEXT: ldr q0, [x0]
683 ; CHECK-NEXT: sqabs.16b v0, v0
685 %tmp1 = load <16 x i8>, <16 x i8>* %A
686 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
690 define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
691 ; CHECK-LABEL: sqabs_4h:
693 ; CHECK-NEXT: ldr d0, [x0]
694 ; CHECK-NEXT: sqabs.4h v0, v0
696 %tmp1 = load <4 x i16>, <4 x i16>* %A
697 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
701 define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
702 ; CHECK-LABEL: sqabs_8h:
704 ; CHECK-NEXT: ldr q0, [x0]
705 ; CHECK-NEXT: sqabs.8h v0, v0
707 %tmp1 = load <8 x i16>, <8 x i16>* %A
708 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
712 define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
713 ; CHECK-LABEL: sqabs_2s:
715 ; CHECK-NEXT: ldr d0, [x0]
716 ; CHECK-NEXT: sqabs.2s v0, v0
718 %tmp1 = load <2 x i32>, <2 x i32>* %A
719 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
723 define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
724 ; CHECK-LABEL: sqabs_4s:
726 ; CHECK-NEXT: ldr q0, [x0]
727 ; CHECK-NEXT: sqabs.4s v0, v0
729 %tmp1 = load <4 x i32>, <4 x i32>* %A
730 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
734 declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
735 declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
736 declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
737 declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
738 declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
739 declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
741 define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
742 ; CHECK-LABEL: sqneg_8b:
744 ; CHECK-NEXT: ldr d0, [x0]
745 ; CHECK-NEXT: sqneg.8b v0, v0
747 %tmp1 = load <8 x i8>, <8 x i8>* %A
748 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
752 define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
753 ; CHECK-LABEL: sqneg_16b:
755 ; CHECK-NEXT: ldr q0, [x0]
756 ; CHECK-NEXT: sqneg.16b v0, v0
758 %tmp1 = load <16 x i8>, <16 x i8>* %A
759 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
763 define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
764 ; CHECK-LABEL: sqneg_4h:
766 ; CHECK-NEXT: ldr d0, [x0]
767 ; CHECK-NEXT: sqneg.4h v0, v0
769 %tmp1 = load <4 x i16>, <4 x i16>* %A
770 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
774 define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
775 ; CHECK-LABEL: sqneg_8h:
777 ; CHECK-NEXT: ldr q0, [x0]
778 ; CHECK-NEXT: sqneg.8h v0, v0
780 %tmp1 = load <8 x i16>, <8 x i16>* %A
781 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
785 define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
786 ; CHECK-LABEL: sqneg_2s:
788 ; CHECK-NEXT: ldr d0, [x0]
789 ; CHECK-NEXT: sqneg.2s v0, v0
791 %tmp1 = load <2 x i32>, <2 x i32>* %A
792 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
796 define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
797 ; CHECK-LABEL: sqneg_4s:
799 ; CHECK-NEXT: ldr q0, [x0]
800 ; CHECK-NEXT: sqneg.4s v0, v0
802 %tmp1 = load <4 x i32>, <4 x i32>* %A
803 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
807 declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
808 declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
809 declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
810 declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
811 declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
812 declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
814 define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
815 ; CHECK-LABEL: abs_8b:
817 ; CHECK-NEXT: ldr d0, [x0]
818 ; CHECK-NEXT: abs.8b v0, v0
820 %tmp1 = load <8 x i8>, <8 x i8>* %A
821 %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
825 define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
826 ; CHECK-LABEL: abs_16b:
828 ; CHECK-NEXT: ldr q0, [x0]
829 ; CHECK-NEXT: abs.16b v0, v0
831 %tmp1 = load <16 x i8>, <16 x i8>* %A
832 %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
836 define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
837 ; CHECK-LABEL: abs_4h:
839 ; CHECK-NEXT: ldr d0, [x0]
840 ; CHECK-NEXT: abs.4h v0, v0
842 %tmp1 = load <4 x i16>, <4 x i16>* %A
843 %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
847 define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
848 ; CHECK-LABEL: abs_8h:
850 ; CHECK-NEXT: ldr q0, [x0]
851 ; CHECK-NEXT: abs.8h v0, v0
853 %tmp1 = load <8 x i16>, <8 x i16>* %A
854 %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
858 define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
859 ; CHECK-LABEL: abs_2s:
861 ; CHECK-NEXT: ldr d0, [x0]
862 ; CHECK-NEXT: abs.2s v0, v0
864 %tmp1 = load <2 x i32>, <2 x i32>* %A
865 %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
869 define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
870 ; CHECK-LABEL: abs_4s:
872 ; CHECK-NEXT: ldr q0, [x0]
873 ; CHECK-NEXT: abs.4s v0, v0
875 %tmp1 = load <4 x i32>, <4 x i32>* %A
876 %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
880 define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
881 ; CHECK-LABEL: abs_1d:
883 ; CHECK-NEXT: abs d0, d0
885 %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
889 define i64 @abs_1d_honestly(i64 %A) nounwind {
890 ; CHECK-LABEL: abs_1d_honestly:
892 ; CHECK-NEXT: fmov d0, x0
893 ; CHECK-NEXT: abs d0, d0
894 ; CHECK-NEXT: fmov x0, d0
896 %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
900 declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
901 declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
902 declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
903 declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
904 declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
905 declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
906 declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
907 declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
909 ; FALLBACK-NOT: remark:{{.*}} sabal8h
910 define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
911 ; CHECK-LABEL: sabal8h:
913 ; CHECK-NEXT: ldr d1, [x0]
914 ; CHECK-NEXT: ldr d2, [x1]
915 ; CHECK-NEXT: ldr q0, [x2]
916 ; CHECK-NEXT: sabal.8h v0, v1, v2
918 %tmp1 = load <8 x i8>, <8 x i8>* %A
919 %tmp2 = load <8 x i8>, <8 x i8>* %B
920 %tmp3 = load <8 x i16>, <8 x i16>* %C
921 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
922 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
923 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
927 ; FALLBACK-NOT: remark:{{.*}} sabal4s
928 define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
929 ; CHECK-LABEL: sabal4s:
931 ; CHECK-NEXT: ldr d1, [x0]
932 ; CHECK-NEXT: ldr d2, [x1]
933 ; CHECK-NEXT: ldr q0, [x2]
934 ; CHECK-NEXT: sabal.4s v0, v1, v2
936 %tmp1 = load <4 x i16>, <4 x i16>* %A
937 %tmp2 = load <4 x i16>, <4 x i16>* %B
938 %tmp3 = load <4 x i32>, <4 x i32>* %C
939 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
940 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
941 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
945 ; FALLBACK-NOT: remark:{{.*}} sabal2d
946 define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
947 ; CHECK-LABEL: sabal2d:
949 ; CHECK-NEXT: ldr d1, [x0]
950 ; CHECK-NEXT: ldr d2, [x1]
951 ; CHECK-NEXT: ldr q0, [x2]
952 ; CHECK-NEXT: sabal.2d v0, v1, v2
954 %tmp1 = load <2 x i32>, <2 x i32>* %A
955 %tmp2 = load <2 x i32>, <2 x i32>* %B
956 %tmp3 = load <2 x i64>, <2 x i64>* %C
957 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
958 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
959 %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
960 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
964 define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
965 ; CHECK-LABEL: sabal2_8h:
967 ; CHECK-NEXT: ldr q0, [x2]
968 ; CHECK-NEXT: ldr d1, [x0, #8]
969 ; CHECK-NEXT: ldr d2, [x1, #8]
970 ; CHECK-NEXT: sabal.8h v0, v1, v2
972 %load1 = load <16 x i8>, <16 x i8>* %A
973 %load2 = load <16 x i8>, <16 x i8>* %B
974 %tmp3 = load <8 x i16>, <8 x i16>* %C
975 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
976 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
977 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
978 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
979 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
983 define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
984 ; CHECK-LABEL: sabal2_4s:
986 ; CHECK-NEXT: ldr q0, [x2]
987 ; CHECK-NEXT: ldr d1, [x0, #8]
988 ; CHECK-NEXT: ldr d2, [x1, #8]
989 ; CHECK-NEXT: sabal.4s v0, v1, v2
991 %load1 = load <8 x i16>, <8 x i16>* %A
992 %load2 = load <8 x i16>, <8 x i16>* %B
993 %tmp3 = load <4 x i32>, <4 x i32>* %C
994 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
995 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
996 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
997 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
998 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1002 define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
1003 ; CHECK-LABEL: sabal2_2d:
1005 ; CHECK-NEXT: ldr q0, [x2]
1006 ; CHECK-NEXT: ldr d1, [x0, #8]
1007 ; CHECK-NEXT: ldr d2, [x1, #8]
1008 ; CHECK-NEXT: sabal.2d v0, v1, v2
1010 %load1 = load <4 x i32>, <4 x i32>* %A
1011 %load2 = load <4 x i32>, <4 x i32>* %B
1012 %tmp3 = load <2 x i64>, <2 x i64>* %C
1013 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1014 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1015 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1016 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1017 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
1021 ; FALLBACK-NOT: remark:{{.*}} uabal8h
1022 define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
1023 ; CHECK-LABEL: uabal8h:
1025 ; CHECK-NEXT: ldr d1, [x0]
1026 ; CHECK-NEXT: ldr d2, [x1]
1027 ; CHECK-NEXT: ldr q0, [x2]
1028 ; CHECK-NEXT: uabal.8h v0, v1, v2
1030 %tmp1 = load <8 x i8>, <8 x i8>* %A
1031 %tmp2 = load <8 x i8>, <8 x i8>* %B
1032 %tmp3 = load <8 x i16>, <8 x i16>* %C
1033 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1034 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1035 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1039 ; FALLBACK-NOT: remark:{{.*}} uabal4s
1040 define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1041 ; CHECK-LABEL: uabal4s:
1043 ; CHECK-NEXT: ldr d1, [x0]
1044 ; CHECK-NEXT: ldr d2, [x1]
1045 ; CHECK-NEXT: ldr q0, [x2]
1046 ; CHECK-NEXT: uabal.4s v0, v1, v2
1048 %tmp1 = load <4 x i16>, <4 x i16>* %A
1049 %tmp2 = load <4 x i16>, <4 x i16>* %B
1050 %tmp3 = load <4 x i32>, <4 x i32>* %C
1051 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1052 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1053 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1057 ; FALLBACK-NOT: remark:{{.*}} uabal2d
1058 define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1059 ; CHECK-LABEL: uabal2d:
1061 ; CHECK-NEXT: ldr d1, [x0]
1062 ; CHECK-NEXT: ldr d2, [x1]
1063 ; CHECK-NEXT: ldr q0, [x2]
1064 ; CHECK-NEXT: uabal.2d v0, v1, v2
1066 %tmp1 = load <2 x i32>, <2 x i32>* %A
1067 %tmp2 = load <2 x i32>, <2 x i32>* %B
1068 %tmp3 = load <2 x i64>, <2 x i64>* %C
1069 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1070 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1071 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
1075 define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
1076 ; CHECK-LABEL: uabal2_8h:
1078 ; CHECK-NEXT: ldr q0, [x2]
1079 ; CHECK-NEXT: ldr d1, [x0, #8]
1080 ; CHECK-NEXT: ldr d2, [x1, #8]
1081 ; CHECK-NEXT: uabal.8h v0, v1, v2
1083 %load1 = load <16 x i8>, <16 x i8>* %A
1084 %load2 = load <16 x i8>, <16 x i8>* %B
1085 %tmp3 = load <8 x i16>, <8 x i16>* %C
1086 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1087 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1088 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1089 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1090 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1094 define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
1095 ; CHECK-LABEL: uabal2_4s:
1097 ; CHECK-NEXT: ldr q0, [x2]
1098 ; CHECK-NEXT: ldr d1, [x0, #8]
1099 ; CHECK-NEXT: ldr d2, [x1, #8]
1100 ; CHECK-NEXT: uabal.4s v0, v1, v2
1102 %load1 = load <8 x i16>, <8 x i16>* %A
1103 %load2 = load <8 x i16>, <8 x i16>* %B
1104 %tmp3 = load <4 x i32>, <4 x i32>* %C
1105 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1106 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1107 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1108 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1109 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1113 define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
1114 ; CHECK-LABEL: uabal2_2d:
1116 ; CHECK-NEXT: ldr q0, [x2]
1117 ; CHECK-NEXT: ldr d1, [x0, #8]
1118 ; CHECK-NEXT: ldr d2, [x1, #8]
1119 ; CHECK-NEXT: uabal.2d v0, v1, v2
1121 %load1 = load <4 x i32>, <4 x i32>* %A
1122 %load2 = load <4 x i32>, <4 x i32>* %B
1123 %tmp3 = load <2 x i64>, <2 x i64>* %C
1124 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1125 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1126 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1127 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1128 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
1132 define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
1133 ; CHECK-LABEL: saba_8b:
1135 ; CHECK-NEXT: ldr d1, [x0]
1136 ; CHECK-NEXT: ldr d2, [x1]
1137 ; CHECK-NEXT: ldr d0, [x2]
1138 ; CHECK-NEXT: saba.8b v0, v1, v2
1140 %tmp1 = load <8 x i8>, <8 x i8>* %A
1141 %tmp2 = load <8 x i8>, <8 x i8>* %B
1142 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1143 %tmp4 = load <8 x i8>, <8 x i8>* %C
1144 %tmp5 = add <8 x i8> %tmp3, %tmp4
1148 define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
1149 ; CHECK-LABEL: saba_16b:
1151 ; CHECK-NEXT: ldr q1, [x0]
1152 ; CHECK-NEXT: ldr q2, [x1]
1153 ; CHECK-NEXT: ldr q0, [x2]
1154 ; CHECK-NEXT: saba.16b v0, v1, v2
1156 %tmp1 = load <16 x i8>, <16 x i8>* %A
1157 %tmp2 = load <16 x i8>, <16 x i8>* %B
1158 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
1159 %tmp4 = load <16 x i8>, <16 x i8>* %C
1160 %tmp5 = add <16 x i8> %tmp3, %tmp4
1164 define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
1165 ; CHECK-LABEL: saba_4h:
1167 ; CHECK-NEXT: ldr d1, [x0]
1168 ; CHECK-NEXT: ldr d2, [x1]
1169 ; CHECK-NEXT: ldr d0, [x2]
1170 ; CHECK-NEXT: saba.4h v0, v1, v2
1172 %tmp1 = load <4 x i16>, <4 x i16>* %A
1173 %tmp2 = load <4 x i16>, <4 x i16>* %B
1174 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1175 %tmp4 = load <4 x i16>, <4 x i16>* %C
1176 %tmp5 = add <4 x i16> %tmp3, %tmp4
1180 define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
1181 ; CHECK-LABEL: saba_8h:
1183 ; CHECK-NEXT: ldr q1, [x0]
1184 ; CHECK-NEXT: ldr q2, [x1]
1185 ; CHECK-NEXT: ldr q0, [x2]
1186 ; CHECK-NEXT: saba.8h v0, v1, v2
1188 %tmp1 = load <8 x i16>, <8 x i16>* %A
1189 %tmp2 = load <8 x i16>, <8 x i16>* %B
1190 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
1191 %tmp4 = load <8 x i16>, <8 x i16>* %C
1192 %tmp5 = add <8 x i16> %tmp3, %tmp4
1196 define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
1197 ; CHECK-LABEL: saba_2s:
1199 ; CHECK-NEXT: ldr d1, [x0]
1200 ; CHECK-NEXT: ldr d2, [x1]
1201 ; CHECK-NEXT: ldr d0, [x2]
1202 ; CHECK-NEXT: saba.2s v0, v1, v2
1204 %tmp1 = load <2 x i32>, <2 x i32>* %A
1205 %tmp2 = load <2 x i32>, <2 x i32>* %B
1206 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1207 %tmp4 = load <2 x i32>, <2 x i32>* %C
1208 %tmp5 = add <2 x i32> %tmp3, %tmp4
1212 define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
1213 ; CHECK-LABEL: saba_4s:
1215 ; CHECK-NEXT: ldr q1, [x0]
1216 ; CHECK-NEXT: ldr q2, [x1]
1217 ; CHECK-NEXT: ldr q0, [x2]
1218 ; CHECK-NEXT: saba.4s v0, v1, v2
1220 %tmp1 = load <4 x i32>, <4 x i32>* %A
1221 %tmp2 = load <4 x i32>, <4 x i32>* %B
1222 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
1223 %tmp4 = load <4 x i32>, <4 x i32>* %C
1224 %tmp5 = add <4 x i32> %tmp3, %tmp4
1228 define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
1229 ; CHECK-LABEL: uaba_8b:
1231 ; CHECK-NEXT: ldr d1, [x0]
1232 ; CHECK-NEXT: ldr d2, [x1]
1233 ; CHECK-NEXT: ldr d0, [x2]
1234 ; CHECK-NEXT: uaba.8b v0, v1, v2
1236 %tmp1 = load <8 x i8>, <8 x i8>* %A
1237 %tmp2 = load <8 x i8>, <8 x i8>* %B
1238 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1239 %tmp4 = load <8 x i8>, <8 x i8>* %C
1240 %tmp5 = add <8 x i8> %tmp3, %tmp4
1244 define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
1245 ; CHECK-LABEL: uaba_16b:
1247 ; CHECK-NEXT: ldr q1, [x0]
1248 ; CHECK-NEXT: ldr q2, [x1]
1249 ; CHECK-NEXT: ldr q0, [x2]
1250 ; CHECK-NEXT: uaba.16b v0, v1, v2
1252 %tmp1 = load <16 x i8>, <16 x i8>* %A
1253 %tmp2 = load <16 x i8>, <16 x i8>* %B
1254 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
1255 %tmp4 = load <16 x i8>, <16 x i8>* %C
1256 %tmp5 = add <16 x i8> %tmp3, %tmp4
1260 define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
1261 ; CHECK-LABEL: uaba_4h:
1263 ; CHECK-NEXT: ldr d1, [x0]
1264 ; CHECK-NEXT: ldr d2, [x1]
1265 ; CHECK-NEXT: ldr d0, [x2]
1266 ; CHECK-NEXT: uaba.4h v0, v1, v2
1268 %tmp1 = load <4 x i16>, <4 x i16>* %A
1269 %tmp2 = load <4 x i16>, <4 x i16>* %B
1270 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1271 %tmp4 = load <4 x i16>, <4 x i16>* %C
1272 %tmp5 = add <4 x i16> %tmp3, %tmp4
1276 define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
1277 ; CHECK-LABEL: uaba_8h:
1279 ; CHECK-NEXT: ldr q1, [x0]
1280 ; CHECK-NEXT: ldr q2, [x1]
1281 ; CHECK-NEXT: ldr q0, [x2]
1282 ; CHECK-NEXT: uaba.8h v0, v1, v2
1284 %tmp1 = load <8 x i16>, <8 x i16>* %A
1285 %tmp2 = load <8 x i16>, <8 x i16>* %B
1286 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
1287 %tmp4 = load <8 x i16>, <8 x i16>* %C
1288 %tmp5 = add <8 x i16> %tmp3, %tmp4
1292 define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
1293 ; CHECK-LABEL: uaba_2s:
1295 ; CHECK-NEXT: ldr d1, [x0]
1296 ; CHECK-NEXT: ldr d2, [x1]
1297 ; CHECK-NEXT: ldr d0, [x2]
1298 ; CHECK-NEXT: uaba.2s v0, v1, v2
1300 %tmp1 = load <2 x i32>, <2 x i32>* %A
1301 %tmp2 = load <2 x i32>, <2 x i32>* %B
1302 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1303 %tmp4 = load <2 x i32>, <2 x i32>* %C
1304 %tmp5 = add <2 x i32> %tmp3, %tmp4
1308 define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
1309 ; CHECK-LABEL: uaba_4s:
1311 ; CHECK-NEXT: ldr q1, [x0]
1312 ; CHECK-NEXT: ldr q2, [x1]
1313 ; CHECK-NEXT: ldr q0, [x2]
1314 ; CHECK-NEXT: uaba.4s v0, v1, v2
1316 %tmp1 = load <4 x i32>, <4 x i32>* %A
1317 %tmp2 = load <4 x i32>, <4 x i32>* %B
1318 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
1319 %tmp4 = load <4 x i32>, <4 x i32>* %C
1320 %tmp5 = add <4 x i32> %tmp3, %tmp4
1325 define float @fabds(float %a, float %b) nounwind {
1326 ; CHECK-LABEL: fabds:
1328 ; CHECK-NEXT: fabd s0, s0, s1
1330 %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
1334 define double @fabdd(double %a, double %b) nounwind {
1335 ; CHECK-LABEL: fabdd:
1337 ; CHECK-NEXT: fabd d0, d0, d1
1339 %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
1343 declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
1344 declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
1346 define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
1347 ; CHECK-LABEL: fabds_from_fsub_fabs:
1349 ; CHECK-NEXT: fabd s0, s0, s1
1351 %sub = fsub float %a, %b
1352 %abs = tail call float @llvm.fabs.f32(float %sub)
1356 define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
1357 ; CHECK-LABEL: fabdd_from_fsub_fabs:
1359 ; CHECK-NEXT: fabd d0, d0, d1
1361 %sub = fsub double %a, %b
1362 %abs = tail call double @llvm.fabs.f64(double %sub)
1366 declare float @llvm.fabs.f32(float) nounwind readnone
1367 declare double @llvm.fabs.f64(double) nounwind readnone
1369 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1370 ; CHECK-LABEL: uabdl_from_extract_dup:
1372 ; CHECK-NEXT: dup.2s v1, w0
1373 ; CHECK-NEXT: uabdl.2d v0, v0, v1
1375 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1376 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1378 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1380 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1381 %res1 = zext <2 x i32> %res to <2 x i64>
1385 define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1386 ; CHECK-LABEL: uabdl2_from_extract_dup:
1388 ; CHECK-NEXT: dup.4s v1, w0
1389 ; CHECK-NEXT: uabdl2.2d v0, v0, v1
1391 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1392 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1394 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1396 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1397 %res1 = zext <2 x i32> %res to <2 x i64>
1401 define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1402 ; CHECK-LABEL: sabdl_from_extract_dup:
1404 ; CHECK-NEXT: dup.2s v1, w0
1405 ; CHECK-NEXT: sabdl.2d v0, v0, v1
1407 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1408 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1410 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1412 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1413 %res1 = zext <2 x i32> %res to <2 x i64>
1417 define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1418 ; CHECK-LABEL: sabdl2_from_extract_dup:
1420 ; CHECK-NEXT: dup.4s v1, w0
1421 ; CHECK-NEXT: sabdl2.2d v0, v0, v1
1423 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1424 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1426 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1428 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1429 %res1 = zext <2 x i32> %res to <2 x i64>
1433 define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
1434 ; DAG-LABEL: abspattern1:
1436 ; DAG-NEXT: abs.2s v0, v0
1439 ; GISEL-LABEL: abspattern1:
1441 ; GISEL-NEXT: movi.2d v1, #0000000000000000
1442 ; GISEL-NEXT: cmge.2s v1, v0, v1
1443 ; GISEL-NEXT: shl.2s v1, v1, #31
1444 ; GISEL-NEXT: neg.2s v2, v0
1445 ; GISEL-NEXT: sshr.2s v1, v1, #31
1446 ; GISEL-NEXT: bif.8b v0, v2, v1
1449 %tmp1neg = sub <2 x i32> zeroinitializer, %a
1450 %b = icmp sge <2 x i32> %a, zeroinitializer
1451 %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
1455 define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
1456 ; DAG-LABEL: abspattern2:
1458 ; DAG-NEXT: abs.4h v0, v0
1461 ; GISEL-LABEL: abspattern2:
1463 ; GISEL-NEXT: movi.2d v1, #0000000000000000
1464 ; GISEL-NEXT: cmgt.4h v1, v0, v1
1465 ; GISEL-NEXT: shl.4h v1, v1, #15
1466 ; GISEL-NEXT: neg.4h v2, v0
1467 ; GISEL-NEXT: sshr.4h v1, v1, #15
1468 ; GISEL-NEXT: bif.8b v0, v2, v1
1470 ; For GlobalISel, this generates terrible code until we can pattern match this to abs.
1472 %tmp1neg = sub <4 x i16> zeroinitializer, %a
1473 %b = icmp sgt <4 x i16> %a, zeroinitializer
1474 %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
1478 define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
1479 ; DAG-LABEL: abspattern3:
1481 ; DAG-NEXT: abs.8b v0, v0
1484 ; GISEL-LABEL: abspattern3:
1486 ; GISEL-NEXT: movi.2d v1, #0000000000000000
1487 ; GISEL-NEXT: cmgt.8b v1, v1, v0
1488 ; GISEL-NEXT: shl.8b v1, v1, #7
1489 ; GISEL-NEXT: neg.8b v2, v0
1490 ; GISEL-NEXT: sshr.8b v1, v1, #7
1491 ; GISEL-NEXT: bit.8b v0, v2, v1
1494 %tmp1neg = sub <8 x i8> zeroinitializer, %a
1495 %b = icmp slt <8 x i8> %a, zeroinitializer
1496 %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
1500 define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
1501 ; DAG-LABEL: abspattern4:
1503 ; DAG-NEXT: abs.4s v0, v0
1506 ; GISEL-LABEL: abspattern4:
1508 ; GISEL-NEXT: movi.2d v1, #0000000000000000
1509 ; GISEL-NEXT: cmge.4s v1, v0, v1
1510 ; GISEL-NEXT: shl.4s v1, v1, #31
1511 ; GISEL-NEXT: neg.4s v2, v0
1512 ; GISEL-NEXT: sshr.4s v1, v1, #31
1513 ; GISEL-NEXT: bif.16b v0, v2, v1
1516 %tmp1neg = sub <4 x i32> zeroinitializer, %a
1517 %b = icmp sge <4 x i32> %a, zeroinitializer
1518 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
1522 define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
1523 ; DAG-LABEL: abspattern5:
1525 ; DAG-NEXT: abs.8h v0, v0
1528 ; GISEL-LABEL: abspattern5:
1530 ; GISEL-NEXT: movi.2d v1, #0000000000000000
1531 ; GISEL-NEXT: cmgt.8h v1, v0, v1
1532 ; GISEL-NEXT: shl.8h v1, v1, #15
1533 ; GISEL-NEXT: neg.8h v2, v0
1534 ; GISEL-NEXT: sshr.8h v1, v1, #15
1535 ; GISEL-NEXT: bif.16b v0, v2, v1
1538 %tmp1neg = sub <8 x i16> zeroinitializer, %a
1539 %b = icmp sgt <8 x i16> %a, zeroinitializer
1540 %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
1544 define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
1545 ; DAG-LABEL: abspattern6:
1547 ; DAG-NEXT: abs.16b v0, v0
1550 ; GISEL-LABEL: abspattern6:
1552 ; GISEL-NEXT: movi.2d v1, #0000000000000000
1553 ; GISEL-NEXT: cmgt.16b v1, v1, v0
1554 ; GISEL-NEXT: shl.16b v1, v1, #7
1555 ; GISEL-NEXT: neg.16b v2, v0
1556 ; GISEL-NEXT: sshr.16b v1, v1, #7
1557 ; GISEL-NEXT: bit.16b v0, v2, v1
1560 %tmp1neg = sub <16 x i8> zeroinitializer, %a
1561 %b = icmp slt <16 x i8> %a, zeroinitializer
1562 %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
1566 define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
1567 ; DAG-LABEL: abspattern7:
1569 ; DAG-NEXT: abs.2d v0, v0
1572 ; GISEL-LABEL: abspattern7:
1574 ; GISEL-NEXT: movi.2d v1, #0000000000000000
1575 ; GISEL-NEXT: cmge.2d v1, v1, v0
1576 ; GISEL-NEXT: shl.2d v1, v1, #63
1577 ; GISEL-NEXT: neg.2d v2, v0
1578 ; GISEL-NEXT: sshr.2d v1, v1, #63
1579 ; GISEL-NEXT: bit.16b v0, v2, v1
1582 %tmp1neg = sub <2 x i64> zeroinitializer, %a
1583 %b = icmp sle <2 x i64> %a, zeroinitializer
1584 %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
1588 define <2 x i64> @uabd_i32(<2 x i32> %a, <2 x i32> %b) {
1589 ; DAG-LABEL: uabd_i32:
1591 ; DAG-NEXT: sabdl.2d v0, v0, v1
1594 ; GISEL-LABEL: uabd_i32:
1596 ; GISEL-NEXT: movi.2d v2, #0000000000000000
1597 ; GISEL-NEXT: ssubl.2d v0, v0, v1
1598 ; GISEL-NEXT: cmgt.2d v1, v2, v0
1599 ; GISEL-NEXT: shl.2d v1, v1, #63
1600 ; GISEL-NEXT: neg.2d v2, v0
1601 ; GISEL-NEXT: sshr.2d v1, v1, #63
1602 ; GISEL-NEXT: bit.16b v0, v2, v1
1604 %aext = sext <2 x i32> %a to <2 x i64>
1605 %bext = sext <2 x i32> %b to <2 x i64>
1606 %abdiff = sub nsw <2 x i64> %aext, %bext
1607 %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
1608 %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
1609 %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
1610 ret <2 x i64> %absel
1614 define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) {
1615 ; CHECK-LABEL: uabd_i64:
1617 ; CHECK-NEXT: fmov x9, d0
1618 ; CHECK-NEXT: fmov x12, d1
1619 ; CHECK-NEXT: asr x10, x9, #63
1620 ; CHECK-NEXT: asr x13, x12, #63
1621 ; CHECK-NEXT: subs x9, x9, x12
1622 ; CHECK-NEXT: mov.d x8, v0[1]
1623 ; CHECK-NEXT: mov.d x11, v1[1]
1624 ; CHECK-NEXT: sbcs x10, x10, x13
1625 ; CHECK-NEXT: asr x12, x8, #63
1626 ; CHECK-NEXT: asr x14, x11, #63
1627 ; CHECK-NEXT: subs x8, x8, x11
1628 ; CHECK-NEXT: sbcs x11, x12, x14
1629 ; CHECK-NEXT: negs x12, x8
1630 ; CHECK-NEXT: ngcs x13, x11
1631 ; CHECK-NEXT: cmp x11, #0
1632 ; CHECK-NEXT: csel x2, x12, x8, lt
1633 ; CHECK-NEXT: csel x3, x13, x11, lt
1634 ; CHECK-NEXT: negs x8, x9
1635 ; CHECK-NEXT: ngcs x11, x10
1636 ; CHECK-NEXT: cmp x10, #0
1637 ; CHECK-NEXT: csel x8, x8, x9, lt
1638 ; CHECK-NEXT: csel x1, x11, x10, lt
1639 ; CHECK-NEXT: fmov d0, x8
1640 ; CHECK-NEXT: mov.d v0[1], x1
1641 ; CHECK-NEXT: fmov x0, d0
1643 %aext = sext <2 x i64> %a to <2 x i128>
1644 %bext = sext <2 x i64> %b to <2 x i128>
1645 %abdiff = sub nsw <2 x i128> %aext, %bext
1646 %abcmp = icmp slt <2 x i128> %abdiff, zeroinitializer
1647 %ababs = sub nsw <2 x i128> zeroinitializer, %abdiff
1648 %absel = select <2 x i1> %abcmp, <2 x i128> %ababs, <2 x i128> %abdiff
1649 ret <2 x i128> %absel