1 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
2 ; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=arm64-eabi -aarch64-neon-syntax=apple 2>&1 | FileCheck %s --check-prefixes=FALLBACK,CHECK
4 ; FALLBACK-NOT: remark:{{.*}} G_ZEXT
5 ; FALLBACK-NOT: remark:{{.*}} sabdl8h
6 define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
9 %tmp1 = load <8 x i8>, <8 x i8>* %A
10 %tmp2 = load <8 x i8>, <8 x i8>* %B
11 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
12 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
16 ; FALLBACK-NOT: remark:{{.*}} sabdl4s
17 define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
18 ;CHECK-LABEL: sabdl4s:
20 %tmp1 = load <4 x i16>, <4 x i16>* %A
21 %tmp2 = load <4 x i16>, <4 x i16>* %B
22 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
23 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
27 ; FALLBACK-NOT: remark:{{.*}} sabdl2d
28 define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
29 ;CHECK-LABEL: sabdl2d:
31 %tmp1 = load <2 x i32>, <2 x i32>* %A
32 %tmp2 = load <2 x i32>, <2 x i32>* %B
33 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
34 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
38 define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
39 ;CHECK-LABEL: sabdl2_8h:
41 %load1 = load <16 x i8>, <16 x i8>* %A
42 %load2 = load <16 x i8>, <16 x i8>* %B
43 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
44 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
45 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
46 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
50 define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
51 ;CHECK-LABEL: sabdl2_4s:
53 %load1 = load <8 x i16>, <8 x i16>* %A
54 %load2 = load <8 x i16>, <8 x i16>* %B
55 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
56 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
57 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
58 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
62 define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
63 ;CHECK-LABEL: sabdl2_2d:
65 %load1 = load <4 x i32>, <4 x i32>* %A
66 %load2 = load <4 x i32>, <4 x i32>* %B
67 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
68 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
69 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
70 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
74 ; FALLBACK-NOT: remark:{{.*}} uabdl8h)
75 define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
76 ;CHECK-LABEL: uabdl8h:
78 %tmp1 = load <8 x i8>, <8 x i8>* %A
79 %tmp2 = load <8 x i8>, <8 x i8>* %B
80 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
81 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
85 ; FALLBACK-NOT: remark:{{.*}} uabdl4s)
86 define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
87 ;CHECK-LABEL: uabdl4s:
89 %tmp1 = load <4 x i16>, <4 x i16>* %A
90 %tmp2 = load <4 x i16>, <4 x i16>* %B
91 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
92 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
96 ; FALLBACK-NOT: remark:{{.*}} uabdl2d)
97 define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
98 ;CHECK-LABEL: uabdl2d:
100 %tmp1 = load <2 x i32>, <2 x i32>* %A
101 %tmp2 = load <2 x i32>, <2 x i32>* %B
102 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
103 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
107 define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
108 ;CHECK-LABEL: uabdl2_8h:
110 %load1 = load <16 x i8>, <16 x i8>* %A
111 %load2 = load <16 x i8>, <16 x i8>* %B
112 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
113 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
115 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
116 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
120 define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
121 ;CHECK-LABEL: uabdl2_4s:
123 %load1 = load <8 x i16>, <8 x i16>* %A
124 %load2 = load <8 x i16>, <8 x i16>* %B
125 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
126 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
127 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
128 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
132 define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
133 ;CHECK-LABEL: uabdl2_2d:
135 %load1 = load <4 x i32>, <4 x i32>* %A
136 %load2 = load <4 x i32>, <4 x i32>* %B
137 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
138 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
139 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
140 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
144 declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
146 define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
147 ; CHECK-LABEL: uabdl8h_rdx
150 %aload = load <16 x i8>, <16 x i8>* %a, align 1
151 %bload = load <16 x i8>, <16 x i8>* %b, align 1
152 %aext = zext <16 x i8> %aload to <16 x i16>
153 %bext = zext <16 x i8> %bload to <16 x i16>
154 %abdiff = sub nsw <16 x i16> %aext, %bext
155 %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
156 %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
157 %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
158 %reduced_v = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %absel)
162 declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
164 define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
165 ; CHECK-LABEL: uabdl4s_rdx
168 %aload = load <8 x i16>, <8 x i16>* %a, align 1
169 %bload = load <8 x i16>, <8 x i16>* %b, align 1
170 %aext = zext <8 x i16> %aload to <8 x i32>
171 %bext = zext <8 x i16> %bload to <8 x i32>
172 %abdiff = sub nsw <8 x i32> %aext, %bext
173 %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
174 %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
175 %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
176 %reduced_v = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %absel)
180 declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
182 define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
186 %aload = load <4 x i32>, <4 x i32>* %a, align 1
187 %bload = load <4 x i32>, <4 x i32>* %b, align 1
188 %aext = zext <4 x i32> %aload to <4 x i64>
189 %bext = zext <4 x i32> %bload to <4 x i64>
190 %abdiff = sub nsw <4 x i64> %aext, %bext
191 %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
192 %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
193 %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
194 %reduced_v = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %absel)
198 define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
199 ;CHECK-LABEL: fabd_2s:
201 %tmp1 = load <2 x float>, <2 x float>* %A
202 %tmp2 = load <2 x float>, <2 x float>* %B
203 %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
204 ret <2 x float> %tmp3
207 define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
208 ;CHECK-LABEL: fabd_4s:
210 %tmp1 = load <4 x float>, <4 x float>* %A
211 %tmp2 = load <4 x float>, <4 x float>* %B
212 %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
213 ret <4 x float> %tmp3
216 define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
217 ;CHECK-LABEL: fabd_2d:
219 %tmp1 = load <2 x double>, <2 x double>* %A
220 %tmp2 = load <2 x double>, <2 x double>* %B
221 %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
222 ret <2 x double> %tmp3
225 declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
226 declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
227 declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
229 define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
230 ;CHECK-LABEL: fabd_2s_from_fsub_fabs:
232 %tmp1 = load <2 x float>, <2 x float>* %A
233 %tmp2 = load <2 x float>, <2 x float>* %B
234 %sub = fsub <2 x float> %tmp1, %tmp2
235 %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
239 define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
240 ;CHECK-LABEL: fabd_4s_from_fsub_fabs:
242 %tmp1 = load <4 x float>, <4 x float>* %A
243 %tmp2 = load <4 x float>, <4 x float>* %B
244 %sub = fsub <4 x float> %tmp1, %tmp2
245 %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
249 define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
250 ;CHECK-LABEL: fabd_2d_from_fsub_fabs:
252 %tmp1 = load <2 x double>, <2 x double>* %A
253 %tmp2 = load <2 x double>, <2 x double>* %B
254 %sub = fsub <2 x double> %tmp1, %tmp2
255 %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
256 ret <2 x double> %abs
259 declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
260 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
261 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
263 define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
264 ;CHECK-LABEL: sabd_8b:
266 %tmp1 = load <8 x i8>, <8 x i8>* %A
267 %tmp2 = load <8 x i8>, <8 x i8>* %B
268 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
272 define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
273 ;CHECK-LABEL: sabd_16b:
275 %tmp1 = load <16 x i8>, <16 x i8>* %A
276 %tmp2 = load <16 x i8>, <16 x i8>* %B
277 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
281 define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
282 ;CHECK-LABEL: sabd_4h:
284 %tmp1 = load <4 x i16>, <4 x i16>* %A
285 %tmp2 = load <4 x i16>, <4 x i16>* %B
286 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
290 define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
291 ;CHECK-LABEL: sabd_8h:
293 %tmp1 = load <8 x i16>, <8 x i16>* %A
294 %tmp2 = load <8 x i16>, <8 x i16>* %B
295 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
299 define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
300 ;CHECK-LABEL: sabd_2s:
302 %tmp1 = load <2 x i32>, <2 x i32>* %A
303 %tmp2 = load <2 x i32>, <2 x i32>* %B
304 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
308 define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
309 ;CHECK-LABEL: sabd_4s:
311 %tmp1 = load <4 x i32>, <4 x i32>* %A
312 %tmp2 = load <4 x i32>, <4 x i32>* %B
313 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
317 declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
318 declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
319 declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
320 declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
321 declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
322 declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
324 define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
325 ;CHECK-LABEL: uabd_8b:
327 %tmp1 = load <8 x i8>, <8 x i8>* %A
328 %tmp2 = load <8 x i8>, <8 x i8>* %B
329 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
333 define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
334 ;CHECK-LABEL: uabd_16b:
336 %tmp1 = load <16 x i8>, <16 x i8>* %A
337 %tmp2 = load <16 x i8>, <16 x i8>* %B
338 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
342 define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
343 ;CHECK-LABEL: uabd_4h:
345 %tmp1 = load <4 x i16>, <4 x i16>* %A
346 %tmp2 = load <4 x i16>, <4 x i16>* %B
347 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
351 define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
352 ;CHECK-LABEL: uabd_8h:
354 %tmp1 = load <8 x i16>, <8 x i16>* %A
355 %tmp2 = load <8 x i16>, <8 x i16>* %B
356 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
360 define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
361 ;CHECK-LABEL: uabd_2s:
363 %tmp1 = load <2 x i32>, <2 x i32>* %A
364 %tmp2 = load <2 x i32>, <2 x i32>* %B
365 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
369 define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
370 ;CHECK-LABEL: uabd_4s:
372 %tmp1 = load <4 x i32>, <4 x i32>* %A
373 %tmp2 = load <4 x i32>, <4 x i32>* %B
374 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
378 declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
379 declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
380 declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
381 declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
382 declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
383 declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
385 define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
386 ;CHECK-LABEL: sqabs_8b:
388 %tmp1 = load <8 x i8>, <8 x i8>* %A
389 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
393 define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
394 ;CHECK-LABEL: sqabs_16b:
396 %tmp1 = load <16 x i8>, <16 x i8>* %A
397 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
401 define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
402 ;CHECK-LABEL: sqabs_4h:
404 %tmp1 = load <4 x i16>, <4 x i16>* %A
405 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
409 define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
410 ;CHECK-LABEL: sqabs_8h:
412 %tmp1 = load <8 x i16>, <8 x i16>* %A
413 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
417 define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
418 ;CHECK-LABEL: sqabs_2s:
420 %tmp1 = load <2 x i32>, <2 x i32>* %A
421 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
425 define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
426 ;CHECK-LABEL: sqabs_4s:
428 %tmp1 = load <4 x i32>, <4 x i32>* %A
429 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
433 declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
434 declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
435 declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
436 declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
437 declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
438 declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
440 define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
441 ;CHECK-LABEL: sqneg_8b:
443 %tmp1 = load <8 x i8>, <8 x i8>* %A
444 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
448 define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
449 ;CHECK-LABEL: sqneg_16b:
451 %tmp1 = load <16 x i8>, <16 x i8>* %A
452 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
456 define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
457 ;CHECK-LABEL: sqneg_4h:
459 %tmp1 = load <4 x i16>, <4 x i16>* %A
460 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
464 define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
465 ;CHECK-LABEL: sqneg_8h:
467 %tmp1 = load <8 x i16>, <8 x i16>* %A
468 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
472 define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
473 ;CHECK-LABEL: sqneg_2s:
475 %tmp1 = load <2 x i32>, <2 x i32>* %A
476 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
480 define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
481 ;CHECK-LABEL: sqneg_4s:
483 %tmp1 = load <4 x i32>, <4 x i32>* %A
484 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
488 declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
489 declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
490 declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
491 declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
492 declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
493 declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
495 define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
496 ;CHECK-LABEL: abs_8b:
498 %tmp1 = load <8 x i8>, <8 x i8>* %A
499 %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
503 define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
504 ;CHECK-LABEL: abs_16b:
506 %tmp1 = load <16 x i8>, <16 x i8>* %A
507 %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
511 define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
512 ;CHECK-LABEL: abs_4h:
514 %tmp1 = load <4 x i16>, <4 x i16>* %A
515 %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
519 define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
520 ;CHECK-LABEL: abs_8h:
522 %tmp1 = load <8 x i16>, <8 x i16>* %A
523 %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
527 define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
528 ;CHECK-LABEL: abs_2s:
530 %tmp1 = load <2 x i32>, <2 x i32>* %A
531 %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
535 define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
536 ;CHECK-LABEL: abs_4s:
538 %tmp1 = load <4 x i32>, <4 x i32>* %A
539 %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
543 define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
544 ; CHECK-LABEL: abs_1d:
546 %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
550 define i64 @abs_1d_honestly(i64 %A) nounwind {
551 ; CHECK-LABEL: abs_1d_honestly:
553 %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
557 declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
558 declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
559 declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
560 declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
561 declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
562 declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
563 declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
564 declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
566 ; FALLBACK-NOT: remark:{{.*}} sabal8h
567 define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
568 ;CHECK-LABEL: sabal8h:
570 %tmp1 = load <8 x i8>, <8 x i8>* %A
571 %tmp2 = load <8 x i8>, <8 x i8>* %B
572 %tmp3 = load <8 x i16>, <8 x i16>* %C
573 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
574 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
575 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
579 ; FALLBACK-NOT: remark:{{.*}} sabal4s
580 define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
581 ;CHECK-LABEL: sabal4s:
583 %tmp1 = load <4 x i16>, <4 x i16>* %A
584 %tmp2 = load <4 x i16>, <4 x i16>* %B
585 %tmp3 = load <4 x i32>, <4 x i32>* %C
586 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
587 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
588 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
592 ; FALLBACK-NOT: remark:{{.*}} sabal2d
define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ; Widen the absolute difference and accumulate; the duplicate dead zext
  ; (%tmp4.1.1) that shadowed this one has been removed — only %tmp4.1 fed %tmp5.
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
606 define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
607 ;CHECK-LABEL: sabal2_8h:
609 %load1 = load <16 x i8>, <16 x i8>* %A
610 %load2 = load <16 x i8>, <16 x i8>* %B
611 %tmp3 = load <8 x i16>, <8 x i16>* %C
612 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
613 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
614 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
615 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
616 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
620 define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
621 ;CHECK-LABEL: sabal2_4s:
623 %load1 = load <8 x i16>, <8 x i16>* %A
624 %load2 = load <8 x i16>, <8 x i16>* %B
625 %tmp3 = load <4 x i32>, <4 x i32>* %C
626 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
627 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
628 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
629 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
630 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
634 define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
635 ;CHECK-LABEL: sabal2_2d:
637 %load1 = load <4 x i32>, <4 x i32>* %A
638 %load2 = load <4 x i32>, <4 x i32>* %B
639 %tmp3 = load <2 x i64>, <2 x i64>* %C
640 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
641 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
642 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
643 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
644 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
648 ; FALLBACK-NOT: remark:{{.*}} uabal8h
649 define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
650 ;CHECK-LABEL: uabal8h:
652 %tmp1 = load <8 x i8>, <8 x i8>* %A
653 %tmp2 = load <8 x i8>, <8 x i8>* %B
654 %tmp3 = load <8 x i16>, <8 x i16>* %C
655 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
656 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
657 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
; FALLBACK-NOT: remark:{{.*}} uabal4s
662 define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
663 ;CHECK-LABEL: uabal4s:
665 %tmp1 = load <4 x i16>, <4 x i16>* %A
666 %tmp2 = load <4 x i16>, <4 x i16>* %B
667 %tmp3 = load <4 x i32>, <4 x i32>* %C
668 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
669 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
670 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
674 ; FALLBACK-NOT: remark:{{.*}} uabal2d
675 define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
676 ;CHECK-LABEL: uabal2d:
678 %tmp1 = load <2 x i32>, <2 x i32>* %A
679 %tmp2 = load <2 x i32>, <2 x i32>* %B
680 %tmp3 = load <2 x i64>, <2 x i64>* %C
681 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
682 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
683 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
687 define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
688 ;CHECK-LABEL: uabal2_8h:
690 %load1 = load <16 x i8>, <16 x i8>* %A
691 %load2 = load <16 x i8>, <16 x i8>* %B
692 %tmp3 = load <8 x i16>, <8 x i16>* %C
693 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
694 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
695 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
696 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
697 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
701 define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
702 ;CHECK-LABEL: uabal2_4s:
704 %load1 = load <8 x i16>, <8 x i16>* %A
705 %load2 = load <8 x i16>, <8 x i16>* %B
706 %tmp3 = load <4 x i32>, <4 x i32>* %C
707 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
708 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
709 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
710 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
711 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
715 define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
716 ;CHECK-LABEL: uabal2_2d:
718 %load1 = load <4 x i32>, <4 x i32>* %A
719 %load2 = load <4 x i32>, <4 x i32>* %B
720 %tmp3 = load <2 x i64>, <2 x i64>* %C
721 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
722 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
723 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
724 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
725 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
729 define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
730 ;CHECK-LABEL: saba_8b:
732 %tmp1 = load <8 x i8>, <8 x i8>* %A
733 %tmp2 = load <8 x i8>, <8 x i8>* %B
734 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
735 %tmp4 = load <8 x i8>, <8 x i8>* %C
736 %tmp5 = add <8 x i8> %tmp3, %tmp4
740 define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
741 ;CHECK-LABEL: saba_16b:
743 %tmp1 = load <16 x i8>, <16 x i8>* %A
744 %tmp2 = load <16 x i8>, <16 x i8>* %B
745 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
746 %tmp4 = load <16 x i8>, <16 x i8>* %C
747 %tmp5 = add <16 x i8> %tmp3, %tmp4
751 define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
752 ;CHECK-LABEL: saba_4h:
754 %tmp1 = load <4 x i16>, <4 x i16>* %A
755 %tmp2 = load <4 x i16>, <4 x i16>* %B
756 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
757 %tmp4 = load <4 x i16>, <4 x i16>* %C
758 %tmp5 = add <4 x i16> %tmp3, %tmp4
762 define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
763 ;CHECK-LABEL: saba_8h:
765 %tmp1 = load <8 x i16>, <8 x i16>* %A
766 %tmp2 = load <8 x i16>, <8 x i16>* %B
767 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
768 %tmp4 = load <8 x i16>, <8 x i16>* %C
769 %tmp5 = add <8 x i16> %tmp3, %tmp4
773 define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
774 ;CHECK-LABEL: saba_2s:
776 %tmp1 = load <2 x i32>, <2 x i32>* %A
777 %tmp2 = load <2 x i32>, <2 x i32>* %B
778 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
779 %tmp4 = load <2 x i32>, <2 x i32>* %C
780 %tmp5 = add <2 x i32> %tmp3, %tmp4
784 define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
785 ;CHECK-LABEL: saba_4s:
787 %tmp1 = load <4 x i32>, <4 x i32>* %A
788 %tmp2 = load <4 x i32>, <4 x i32>* %B
789 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
790 %tmp4 = load <4 x i32>, <4 x i32>* %C
791 %tmp5 = add <4 x i32> %tmp3, %tmp4
795 define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
796 ;CHECK-LABEL: uaba_8b:
798 %tmp1 = load <8 x i8>, <8 x i8>* %A
799 %tmp2 = load <8 x i8>, <8 x i8>* %B
800 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
801 %tmp4 = load <8 x i8>, <8 x i8>* %C
802 %tmp5 = add <8 x i8> %tmp3, %tmp4
806 define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
807 ;CHECK-LABEL: uaba_16b:
809 %tmp1 = load <16 x i8>, <16 x i8>* %A
810 %tmp2 = load <16 x i8>, <16 x i8>* %B
811 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
812 %tmp4 = load <16 x i8>, <16 x i8>* %C
813 %tmp5 = add <16 x i8> %tmp3, %tmp4
817 define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
818 ;CHECK-LABEL: uaba_4h:
820 %tmp1 = load <4 x i16>, <4 x i16>* %A
821 %tmp2 = load <4 x i16>, <4 x i16>* %B
822 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
823 %tmp4 = load <4 x i16>, <4 x i16>* %C
824 %tmp5 = add <4 x i16> %tmp3, %tmp4
828 define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
829 ;CHECK-LABEL: uaba_8h:
831 %tmp1 = load <8 x i16>, <8 x i16>* %A
832 %tmp2 = load <8 x i16>, <8 x i16>* %B
833 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
834 %tmp4 = load <8 x i16>, <8 x i16>* %C
835 %tmp5 = add <8 x i16> %tmp3, %tmp4
839 define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
840 ;CHECK-LABEL: uaba_2s:
842 %tmp1 = load <2 x i32>, <2 x i32>* %A
843 %tmp2 = load <2 x i32>, <2 x i32>* %B
844 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
845 %tmp4 = load <2 x i32>, <2 x i32>* %C
846 %tmp5 = add <2 x i32> %tmp3, %tmp4
850 define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
851 ;CHECK-LABEL: uaba_4s:
853 %tmp1 = load <4 x i32>, <4 x i32>* %A
854 %tmp2 = load <4 x i32>, <4 x i32>* %B
855 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
856 %tmp4 = load <4 x i32>, <4 x i32>* %C
857 %tmp5 = add <4 x i32> %tmp3, %tmp4
862 define float @fabds(float %a, float %b) nounwind {
863 ; CHECK-LABEL: fabds:
864 ; CHECK: fabd s0, s0, s1
865 %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
869 define double @fabdd(double %a, double %b) nounwind {
870 ; CHECK-LABEL: fabdd:
871 ; CHECK: fabd d0, d0, d1
872 %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
876 declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
877 declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
879 define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
880 ; CHECK-LABEL: fabds_from_fsub_fabs:
881 ; CHECK: fabd s0, s0, s1
882 %sub = fsub float %a, %b
883 %abs = tail call float @llvm.fabs.f32(float %sub)
887 define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
888 ; CHECK-LABEL: fabdd_from_fsub_fabs:
889 ; CHECK: fabd d0, d0, d1
890 %sub = fsub double %a, %b
891 %abs = tail call double @llvm.fabs.f64(double %sub)
895 declare float @llvm.fabs.f32(float) nounwind readnone
896 declare double @llvm.fabs.f64(double) nounwind readnone
898 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
899 ; CHECK-LABEL: uabdl_from_extract_dup:
902 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
903 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
905 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
907 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
908 %res1 = zext <2 x i32> %res to <2 x i64>
912 define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
913 ; CHECK-LABEL: uabdl2_from_extract_dup:
916 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
917 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
919 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
921 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
922 %res1 = zext <2 x i32> %res to <2 x i64>
926 define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
927 ; CHECK-LABEL: sabdl_from_extract_dup:
930 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
931 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
933 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
935 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
936 %res1 = zext <2 x i32> %res to <2 x i64>
940 define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
941 ; CHECK-LABEL: sabdl2_from_extract_dup:
944 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
945 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
947 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
949 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
950 %res1 = zext <2 x i32> %res to <2 x i64>
954 define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
955 ; CHECK-LABEL: abspattern1:
958 %tmp1neg = sub <2 x i32> zeroinitializer, %a
959 %b = icmp sge <2 x i32> %a, zeroinitializer
960 %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
964 define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
965 ; CHECK-LABEL: abspattern2:
968 %tmp1neg = sub <4 x i16> zeroinitializer, %a
969 %b = icmp sgt <4 x i16> %a, zeroinitializer
970 %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
974 define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
975 ; CHECK-LABEL: abspattern3:
978 %tmp1neg = sub <8 x i8> zeroinitializer, %a
979 %b = icmp slt <8 x i8> %a, zeroinitializer
980 %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
984 define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
985 ; CHECK-LABEL: abspattern4:
988 %tmp1neg = sub <4 x i32> zeroinitializer, %a
989 %b = icmp sge <4 x i32> %a, zeroinitializer
990 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
994 define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
995 ; CHECK-LABEL: abspattern5:
998 %tmp1neg = sub <8 x i16> zeroinitializer, %a
999 %b = icmp sgt <8 x i16> %a, zeroinitializer
1000 %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
1004 define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
1005 ; CHECK-LABEL: abspattern6:
1008 %tmp1neg = sub <16 x i8> zeroinitializer, %a
1009 %b = icmp slt <16 x i8> %a, zeroinitializer
1010 %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
1014 define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
1015 ; CHECK-LABEL: abspattern7:
1018 %tmp1neg = sub <2 x i64> zeroinitializer, %a
1019 %b = icmp sle <2 x i64> %a, zeroinitializer
1020 %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a