1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,CHECK-SD %s
3 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
5 ; CHECK-GI: warning: Instruction selection used fallback path for abs_8b
6 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_16b
7 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_4h
8 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_8h
9 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_2s
10 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_4s
11 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_1d
12 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_1d_honestly
13 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fabds
14 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fabdd
15 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uabd_i64
; --- Signed absolute-difference long (low half) ---
; A sabd.* intrinsic whose result is zero-extended to the double-width element
; type should select a single widening SABDL instruction (see the CHECK-NEXT
; sabdl.* lines), instead of a sabd followed by a separate extend.
17 define <8 x i16> @sabdl8h(ptr %A, ptr %B) nounwind {
18 ; CHECK-LABEL: sabdl8h:
20 ; CHECK-NEXT: ldr d0, [x0]
21 ; CHECK-NEXT: ldr d1, [x1]
22 ; CHECK-NEXT: sabdl.8h v0, v0, v1
24 %tmp1 = load <8 x i8>, ptr %A
25 %tmp2 = load <8 x i8>, ptr %B
26 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
27 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
; Same pattern at i16 -> i32 element width.
31 define <4 x i32> @sabdl4s(ptr %A, ptr %B) nounwind {
32 ; CHECK-LABEL: sabdl4s:
34 ; CHECK-NEXT: ldr d0, [x0]
35 ; CHECK-NEXT: ldr d1, [x1]
36 ; CHECK-NEXT: sabdl.4s v0, v0, v1
38 %tmp1 = load <4 x i16>, ptr %A
39 %tmp2 = load <4 x i16>, ptr %B
40 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
41 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
; Same pattern at i32 -> i64 element width.
45 define <2 x i64> @sabdl2d(ptr %A, ptr %B) nounwind {
46 ; CHECK-LABEL: sabdl2d:
48 ; CHECK-NEXT: ldr d0, [x0]
49 ; CHECK-NEXT: ldr d1, [x1]
50 ; CHECK-NEXT: sabdl.2d v0, v0, v1
52 %tmp1 = load <2 x i32>, ptr %A
53 %tmp2 = load <2 x i32>, ptr %B
54 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
55 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
; --- Signed absolute-difference long, high half (sabdl2) ---
; The inputs are the high halves of 128-bit loads (shufflevector taking lanes
; n/2..n-1). SelectionDAG narrows the loads themselves (64-bit ldr with a #8
; byte offset) and uses plain sabdl; GlobalISel keeps the full 128-bit loads
; and selects the high-half sabdl2 form. Both are checked separately below.
59 define <8 x i16> @sabdl2_8h(ptr %A, ptr %B) nounwind {
60 ; CHECK-SD-LABEL: sabdl2_8h:
62 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
63 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
64 ; CHECK-SD-NEXT: sabdl.8h v0, v0, v1
67 ; CHECK-GI-LABEL: sabdl2_8h:
69 ; CHECK-GI-NEXT: ldr q0, [x0]
70 ; CHECK-GI-NEXT: ldr q1, [x1]
71 ; CHECK-GI-NEXT: sabdl2.8h v0, v0, v1
73 %load1 = load <16 x i8>, ptr %A
74 %load2 = load <16 x i8>, ptr %B
75 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
76 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
77 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
78 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
; Same high-half pattern at i16 -> i32 element width.
82 define <4 x i32> @sabdl2_4s(ptr %A, ptr %B) nounwind {
83 ; CHECK-SD-LABEL: sabdl2_4s:
85 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
86 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
87 ; CHECK-SD-NEXT: sabdl.4s v0, v0, v1
90 ; CHECK-GI-LABEL: sabdl2_4s:
92 ; CHECK-GI-NEXT: ldr q0, [x0]
93 ; CHECK-GI-NEXT: ldr q1, [x1]
94 ; CHECK-GI-NEXT: sabdl2.4s v0, v0, v1
96 %load1 = load <8 x i16>, ptr %A
97 %load2 = load <8 x i16>, ptr %B
98 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
99 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
100 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
101 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
; Same high-half pattern at i32 -> i64 element width.
105 define <2 x i64> @sabdl2_2d(ptr %A, ptr %B) nounwind {
106 ; CHECK-SD-LABEL: sabdl2_2d:
107 ; CHECK-SD: // %bb.0:
108 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
109 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
110 ; CHECK-SD-NEXT: sabdl.2d v0, v0, v1
113 ; CHECK-GI-LABEL: sabdl2_2d:
114 ; CHECK-GI: // %bb.0:
115 ; CHECK-GI-NEXT: ldr q0, [x0]
116 ; CHECK-GI-NEXT: ldr q1, [x1]
117 ; CHECK-GI-NEXT: sabdl2.2d v0, v0, v1
119 %load1 = load <4 x i32>, ptr %A
120 %load2 = load <4 x i32>, ptr %B
121 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
122 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
123 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
124 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
; --- Unsigned absolute-difference long (low half) ---
; Unsigned counterpart of the sabdl tests above: uabd intrinsic + zext should
; select a single widening UABDL instruction.
128 define <8 x i16> @uabdl8h(ptr %A, ptr %B) nounwind {
129 ; CHECK-LABEL: uabdl8h:
131 ; CHECK-NEXT: ldr d0, [x0]
132 ; CHECK-NEXT: ldr d1, [x1]
133 ; CHECK-NEXT: uabdl.8h v0, v0, v1
135 %tmp1 = load <8 x i8>, ptr %A
136 %tmp2 = load <8 x i8>, ptr %B
137 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
138 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
; Same pattern at i16 -> i32 element width.
142 define <4 x i32> @uabdl4s(ptr %A, ptr %B) nounwind {
143 ; CHECK-LABEL: uabdl4s:
145 ; CHECK-NEXT: ldr d0, [x0]
146 ; CHECK-NEXT: ldr d1, [x1]
147 ; CHECK-NEXT: uabdl.4s v0, v0, v1
149 %tmp1 = load <4 x i16>, ptr %A
150 %tmp2 = load <4 x i16>, ptr %B
151 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
152 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
; Same pattern at i32 -> i64 element width.
156 define <2 x i64> @uabdl2d(ptr %A, ptr %B) nounwind {
157 ; CHECK-LABEL: uabdl2d:
159 ; CHECK-NEXT: ldr d0, [x0]
160 ; CHECK-NEXT: ldr d1, [x1]
161 ; CHECK-NEXT: uabdl.2d v0, v0, v1
163 %tmp1 = load <2 x i32>, ptr %A
164 %tmp2 = load <2 x i32>, ptr %B
165 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
166 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
; --- Unsigned absolute-difference long, high half (uabdl2) ---
; As with sabdl2 above: SelectionDAG narrows the loads (ldr d, [x, #8]) and
; uses uabdl; GlobalISel keeps 128-bit loads and selects uabdl2.
170 define <8 x i16> @uabdl2_8h(ptr %A, ptr %B) nounwind {
171 ; CHECK-SD-LABEL: uabdl2_8h:
172 ; CHECK-SD: // %bb.0:
173 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
174 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
175 ; CHECK-SD-NEXT: uabdl.8h v0, v0, v1
178 ; CHECK-GI-LABEL: uabdl2_8h:
179 ; CHECK-GI: // %bb.0:
180 ; CHECK-GI-NEXT: ldr q0, [x0]
181 ; CHECK-GI-NEXT: ldr q1, [x1]
182 ; CHECK-GI-NEXT: uabdl2.8h v0, v0, v1
184 %load1 = load <16 x i8>, ptr %A
185 %load2 = load <16 x i8>, ptr %B
186 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
187 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
189 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
190 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
; Same high-half pattern at i16 -> i32 element width.
194 define <4 x i32> @uabdl2_4s(ptr %A, ptr %B) nounwind {
195 ; CHECK-SD-LABEL: uabdl2_4s:
196 ; CHECK-SD: // %bb.0:
197 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
198 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
199 ; CHECK-SD-NEXT: uabdl.4s v0, v0, v1
202 ; CHECK-GI-LABEL: uabdl2_4s:
203 ; CHECK-GI: // %bb.0:
204 ; CHECK-GI-NEXT: ldr q0, [x0]
205 ; CHECK-GI-NEXT: ldr q1, [x1]
206 ; CHECK-GI-NEXT: uabdl2.4s v0, v0, v1
208 %load1 = load <8 x i16>, ptr %A
209 %load2 = load <8 x i16>, ptr %B
210 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
211 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
212 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
213 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
; Same high-half pattern at i32 -> i64 element width.
217 define <2 x i64> @uabdl2_2d(ptr %A, ptr %B) nounwind {
218 ; CHECK-SD-LABEL: uabdl2_2d:
219 ; CHECK-SD: // %bb.0:
220 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
221 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
222 ; CHECK-SD-NEXT: uabdl.2d v0, v0, v1
225 ; CHECK-GI-LABEL: uabdl2_2d:
226 ; CHECK-GI: // %bb.0:
227 ; CHECK-GI-NEXT: ldr q0, [x0]
228 ; CHECK-GI-NEXT: ldr q1, [x1]
229 ; CHECK-GI-NEXT: uabdl2.2d v0, v0, v1
231 %load1 = load <4 x i32>, ptr %A
232 %load2 = load <4 x i32>, ptr %B
233 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
234 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
235 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
236 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
; Reduction intrinsics used by the *_rdx tests below.
240 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
241 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
; --- Absolute-difference + add-reduction ---
; Open-coded unsigned abs-diff (zext, sub, icmp slt 0, negate, select) of two
; <16 x i8> vectors followed by a vector.reduce.add. SelectionDAG recognises
; the whole pattern as uabd + uaddlv; GlobalISel currently emits the expanded
; widen/compare/negate/select sequence instead (checked as-is below).
243 define i16 @uabd16b_rdx(ptr %a, ptr %b) {
244 ; CHECK-SD-LABEL: uabd16b_rdx:
245 ; CHECK-SD: // %bb.0:
246 ; CHECK-SD-NEXT: ldr q0, [x0]
247 ; CHECK-SD-NEXT: ldr q1, [x1]
248 ; CHECK-SD-NEXT: uabd.16b v0, v0, v1
249 ; CHECK-SD-NEXT: uaddlv.16b h0, v0
250 ; CHECK-SD-NEXT: fmov w0, s0
253 ; CHECK-GI-LABEL: uabd16b_rdx:
254 ; CHECK-GI: // %bb.0:
255 ; CHECK-GI-NEXT: ldr q0, [x0]
256 ; CHECK-GI-NEXT: ldr q1, [x1]
257 ; CHECK-GI-NEXT: usubl.8h v2, v0, v1
258 ; CHECK-GI-NEXT: usubl2.8h v0, v0, v1
259 ; CHECK-GI-NEXT: cmlt.8h v1, v2, #0
260 ; CHECK-GI-NEXT: cmlt.8h v3, v0, #0
261 ; CHECK-GI-NEXT: neg.8h v4, v2
262 ; CHECK-GI-NEXT: neg.8h v5, v0
263 ; CHECK-GI-NEXT: bsl.16b v1, v4, v2
264 ; CHECK-GI-NEXT: bit.16b v0, v5, v3
265 ; CHECK-GI-NEXT: add.8h v0, v1, v0
266 ; CHECK-GI-NEXT: addv.8h h0, v0
267 ; CHECK-GI-NEXT: fmov w0, s0
269 %aload = load <16 x i8>, ptr %a, align 1
270 %bload = load <16 x i8>, ptr %b, align 1
271 %aext = zext <16 x i8> %aload to <16 x i16>
272 %bext = zext <16 x i8> %bload to <16 x i16>
273 %abdiff = sub nsw <16 x i16> %aext, %bext
274 %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
275 %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
276 %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
277 %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)
; Same abs-diff reduction pattern, but widened to i32 accumulators.
; SelectionDAG splits it into uabdl + uabal2 (accumulate) + uaddlv; GlobalISel
; expands to the full widen/compare/negate/select sequence.
281 define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
282 ; CHECK-SD-LABEL: uabd16b_rdx_i32:
283 ; CHECK-SD: // %bb.0:
284 ; CHECK-SD-NEXT: uabdl.8h v2, v0, v1
285 ; CHECK-SD-NEXT: uabal2.8h v2, v0, v1
286 ; CHECK-SD-NEXT: uaddlv.8h s0, v2
287 ; CHECK-SD-NEXT: fmov w0, s0
290 ; CHECK-GI-LABEL: uabd16b_rdx_i32:
291 ; CHECK-GI: // %bb.0:
292 ; CHECK-GI-NEXT: usubl.8h v3, v0, v1
293 ; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
294 ; CHECK-GI-NEXT: usubl2.8h v0, v0, v1
295 ; CHECK-GI-NEXT: sshll.4s v1, v3, #0
296 ; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
297 ; CHECK-GI-NEXT: sshll.4s v5, v0, #0
298 ; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
299 ; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
300 ; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
301 ; CHECK-GI-NEXT: cmlt.4s v2, v1, #0
302 ; CHECK-GI-NEXT: cmlt.4s v7, v4, #0
303 ; CHECK-GI-NEXT: neg.4s v16, v1
304 ; CHECK-GI-NEXT: cmlt.4s v17, v5, #0
305 ; CHECK-GI-NEXT: cmlt.4s v18, v6, #0
306 ; CHECK-GI-NEXT: neg.4s v19, v5
307 ; CHECK-GI-NEXT: bit.16b v1, v16, v2
308 ; CHECK-GI-NEXT: mov.16b v2, v7
309 ; CHECK-GI-NEXT: bif.16b v0, v6, v18
310 ; CHECK-GI-NEXT: bsl.16b v2, v3, v4
311 ; CHECK-GI-NEXT: mov.16b v3, v17
312 ; CHECK-GI-NEXT: bsl.16b v3, v19, v5
313 ; CHECK-GI-NEXT: add.4s v1, v1, v2
314 ; CHECK-GI-NEXT: add.4s v0, v3, v0
315 ; CHECK-GI-NEXT: add.4s v0, v1, v0
316 ; CHECK-GI-NEXT: addv.4s s0, v0
317 ; CHECK-GI-NEXT: fmov w0, s0
319 %aext = zext <16 x i8> %a to <16 x i32>
320 %bext = zext <16 x i8> %b to <16 x i32>
321 %abdiff = sub nsw <16 x i32> %aext, %bext
322 %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
323 %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
324 %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
325 %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
; Signed variant of the preceding test (sext instead of zext), so
; SelectionDAG selects sabdl/sabal2 rather than uabdl/uabal2.
329 define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
330 ; CHECK-SD-LABEL: sabd16b_rdx_i32:
331 ; CHECK-SD: // %bb.0:
332 ; CHECK-SD-NEXT: sabdl.8h v2, v0, v1
333 ; CHECK-SD-NEXT: sabal2.8h v2, v0, v1
334 ; CHECK-SD-NEXT: uaddlv.8h s0, v2
335 ; CHECK-SD-NEXT: fmov w0, s0
338 ; CHECK-GI-LABEL: sabd16b_rdx_i32:
339 ; CHECK-GI: // %bb.0:
340 ; CHECK-GI-NEXT: ssubl.8h v3, v0, v1
341 ; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
342 ; CHECK-GI-NEXT: ssubl2.8h v0, v0, v1
343 ; CHECK-GI-NEXT: sshll.4s v1, v3, #0
344 ; CHECK-GI-NEXT: sshll2.4s v4, v3, #0
345 ; CHECK-GI-NEXT: sshll.4s v5, v0, #0
346 ; CHECK-GI-NEXT: sshll2.4s v6, v0, #0
347 ; CHECK-GI-NEXT: ssubw2.4s v3, v2, v3
348 ; CHECK-GI-NEXT: ssubw2.4s v0, v2, v0
349 ; CHECK-GI-NEXT: cmlt.4s v2, v1, #0
350 ; CHECK-GI-NEXT: cmlt.4s v7, v4, #0
351 ; CHECK-GI-NEXT: neg.4s v16, v1
352 ; CHECK-GI-NEXT: cmlt.4s v17, v5, #0
353 ; CHECK-GI-NEXT: cmlt.4s v18, v6, #0
354 ; CHECK-GI-NEXT: neg.4s v19, v5
355 ; CHECK-GI-NEXT: bit.16b v1, v16, v2
356 ; CHECK-GI-NEXT: mov.16b v2, v7
357 ; CHECK-GI-NEXT: bif.16b v0, v6, v18
358 ; CHECK-GI-NEXT: bsl.16b v2, v3, v4
359 ; CHECK-GI-NEXT: mov.16b v3, v17
360 ; CHECK-GI-NEXT: bsl.16b v3, v19, v5
361 ; CHECK-GI-NEXT: add.4s v1, v1, v2
362 ; CHECK-GI-NEXT: add.4s v0, v3, v0
363 ; CHECK-GI-NEXT: add.4s v0, v1, v0
364 ; CHECK-GI-NEXT: addv.4s s0, v0
365 ; CHECK-GI-NEXT: fmov w0, s0
367 %aext = sext <16 x i8> %a to <16 x i32>
368 %bext = sext <16 x i8> %b to <16 x i32>
369 %abdiff = sub nsw <16 x i32> %aext, %bext
370 %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
371 %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
372 %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
373 %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
; Reduction intrinsics for the i16/i32-element abs-diff reduction tests.
378 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
379 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
; Unsigned abs-diff + reduction over <8 x i16> inputs: SelectionDAG selects
; uabd.8h + uaddlv; GlobalISel emits the expanded compare/negate/select form.
381 define i32 @uabd8h_rdx(ptr %a, ptr %b) {
382 ; CHECK-SD-LABEL: uabd8h_rdx:
383 ; CHECK-SD: // %bb.0:
384 ; CHECK-SD-NEXT: ldr q0, [x0]
385 ; CHECK-SD-NEXT: ldr q1, [x1]
386 ; CHECK-SD-NEXT: uabd.8h v0, v0, v1
387 ; CHECK-SD-NEXT: uaddlv.8h s0, v0
388 ; CHECK-SD-NEXT: fmov w0, s0
391 ; CHECK-GI-LABEL: uabd8h_rdx:
392 ; CHECK-GI: // %bb.0:
393 ; CHECK-GI-NEXT: ldr q0, [x0]
394 ; CHECK-GI-NEXT: ldr q1, [x1]
395 ; CHECK-GI-NEXT: usubl.4s v2, v0, v1
396 ; CHECK-GI-NEXT: usubl2.4s v0, v0, v1
397 ; CHECK-GI-NEXT: cmlt.4s v1, v2, #0
398 ; CHECK-GI-NEXT: cmlt.4s v3, v0, #0
399 ; CHECK-GI-NEXT: neg.4s v4, v2
400 ; CHECK-GI-NEXT: neg.4s v5, v0
401 ; CHECK-GI-NEXT: bsl.16b v1, v4, v2
402 ; CHECK-GI-NEXT: bit.16b v0, v5, v3
403 ; CHECK-GI-NEXT: add.4s v0, v1, v0
404 ; CHECK-GI-NEXT: addv.4s s0, v0
405 ; CHECK-GI-NEXT: fmov w0, s0
407 %aload = load <8 x i16>, ptr %a, align 1
408 %bload = load <8 x i16>, ptr %b, align 1
409 %aext = zext <8 x i16> %aload to <8 x i32>
410 %bext = zext <8 x i16> %bload to <8 x i32>
411 %abdiff = sub nsw <8 x i32> %aext, %bext
412 %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
413 %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
414 %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
415 %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
; Signed variant: sext inputs, so SelectionDAG picks sabd.8h.
419 define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
420 ; CHECK-SD-LABEL: sabd8h_rdx:
421 ; CHECK-SD: // %bb.0:
422 ; CHECK-SD-NEXT: sabd.8h v0, v0, v1
423 ; CHECK-SD-NEXT: uaddlv.8h s0, v0
424 ; CHECK-SD-NEXT: fmov w0, s0
427 ; CHECK-GI-LABEL: sabd8h_rdx:
428 ; CHECK-GI: // %bb.0:
429 ; CHECK-GI-NEXT: ssubl.4s v2, v0, v1
430 ; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1
431 ; CHECK-GI-NEXT: cmlt.4s v1, v2, #0
432 ; CHECK-GI-NEXT: cmlt.4s v3, v0, #0
433 ; CHECK-GI-NEXT: neg.4s v4, v2
434 ; CHECK-GI-NEXT: neg.4s v5, v0
435 ; CHECK-GI-NEXT: bsl.16b v1, v4, v2
436 ; CHECK-GI-NEXT: bit.16b v0, v5, v3
437 ; CHECK-GI-NEXT: add.4s v0, v1, v0
438 ; CHECK-GI-NEXT: addv.4s s0, v0
439 ; CHECK-GI-NEXT: fmov w0, s0
441 %aext = sext <8 x i16> %a to <8 x i32>
442 %bext = sext <8 x i16> %b to <8 x i32>
443 %abdiff = sub nsw <8 x i32> %aext, %bext
444 %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
445 %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
446 %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
447 %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
; Widening abs-diff (<4 x i16> zext to i32) + reduction: SelectionDAG folds
; to uabdl.4s + addv; GlobalISel emits usubl + compare/negate/select.
451 define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
452 ; CHECK-SD-LABEL: uabdl4s_rdx_i32:
453 ; CHECK-SD: // %bb.0:
454 ; CHECK-SD-NEXT: uabdl.4s v0, v0, v1
455 ; CHECK-SD-NEXT: addv.4s s0, v0
456 ; CHECK-SD-NEXT: fmov w0, s0
459 ; CHECK-GI-LABEL: uabdl4s_rdx_i32:
460 ; CHECK-GI: // %bb.0:
461 ; CHECK-GI-NEXT: usubl.4s v0, v0, v1
462 ; CHECK-GI-NEXT: cmlt.4s v1, v0, #0
463 ; CHECK-GI-NEXT: neg.4s v2, v0
464 ; CHECK-GI-NEXT: bit.16b v0, v2, v1
465 ; CHECK-GI-NEXT: addv.4s s0, v0
466 ; CHECK-GI-NEXT: fmov w0, s0
468 %aext = zext <4 x i16> %a to <4 x i32>
469 %bext = zext <4 x i16> %b to <4 x i32>
470 %abdiff = sub nsw <4 x i32> %aext, %bext
471 %abcmp = icmp slt <4 x i32> %abdiff, zeroinitializer
472 %ababs = sub nsw <4 x i32> zeroinitializer, %abdiff
473 %absel = select <4 x i1> %abcmp, <4 x i32> %ababs, <4 x i32> %abdiff
474 %reduced_v = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %absel)
; Reduction intrinsics for the i64-accumulator abs-diff reduction tests.
478 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
479 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
; Unsigned abs-diff over <4 x i32> widened to i64 + reduction. Note the unused
; i32 %h parameter is part of the original test and must stay in the signature.
481 define i64 @uabd4s_rdx(ptr %a, ptr %b, i32 %h) {
482 ; CHECK-SD-LABEL: uabd4s_rdx:
483 ; CHECK-SD: // %bb.0:
484 ; CHECK-SD-NEXT: ldr q0, [x0]
485 ; CHECK-SD-NEXT: ldr q1, [x1]
486 ; CHECK-SD-NEXT: uabd.4s v0, v0, v1
487 ; CHECK-SD-NEXT: uaddlv.4s d0, v0
488 ; CHECK-SD-NEXT: fmov x0, d0
491 ; CHECK-GI-LABEL: uabd4s_rdx:
492 ; CHECK-GI: // %bb.0:
493 ; CHECK-GI-NEXT: ldr q0, [x0]
494 ; CHECK-GI-NEXT: ldr q1, [x1]
495 ; CHECK-GI-NEXT: usubl.2d v2, v0, v1
496 ; CHECK-GI-NEXT: usubl2.2d v0, v0, v1
497 ; CHECK-GI-NEXT: cmlt.2d v1, v2, #0
498 ; CHECK-GI-NEXT: cmlt.2d v3, v0, #0
499 ; CHECK-GI-NEXT: neg.2d v4, v2
500 ; CHECK-GI-NEXT: neg.2d v5, v0
501 ; CHECK-GI-NEXT: bsl.16b v1, v4, v2
502 ; CHECK-GI-NEXT: bit.16b v0, v5, v3
503 ; CHECK-GI-NEXT: add.2d v0, v1, v0
504 ; CHECK-GI-NEXT: addp.2d d0, v0
505 ; CHECK-GI-NEXT: fmov x0, d0
507 %aload = load <4 x i32>, ptr %a, align 1
508 %bload = load <4 x i32>, ptr %b, align 1
509 %aext = zext <4 x i32> %aload to <4 x i64>
510 %bext = zext <4 x i32> %bload to <4 x i64>
511 %abdiff = sub nsw <4 x i64> %aext, %bext
512 %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
513 %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
514 %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
515 %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
; Signed variant: sext inputs, SelectionDAG selects sabd.4s.
519 define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) {
520 ; CHECK-SD-LABEL: sabd4s_rdx:
521 ; CHECK-SD: // %bb.0:
522 ; CHECK-SD-NEXT: sabd.4s v0, v0, v1
523 ; CHECK-SD-NEXT: uaddlv.4s d0, v0
524 ; CHECK-SD-NEXT: fmov x0, d0
527 ; CHECK-GI-LABEL: sabd4s_rdx:
528 ; CHECK-GI: // %bb.0:
529 ; CHECK-GI-NEXT: ssubl.2d v2, v0, v1
530 ; CHECK-GI-NEXT: ssubl2.2d v0, v0, v1
531 ; CHECK-GI-NEXT: cmlt.2d v1, v2, #0
532 ; CHECK-GI-NEXT: cmlt.2d v3, v0, #0
533 ; CHECK-GI-NEXT: neg.2d v4, v2
534 ; CHECK-GI-NEXT: neg.2d v5, v0
535 ; CHECK-GI-NEXT: bsl.16b v1, v4, v2
536 ; CHECK-GI-NEXT: bit.16b v0, v5, v3
537 ; CHECK-GI-NEXT: add.2d v0, v1, v0
538 ; CHECK-GI-NEXT: addp.2d d0, v0
539 ; CHECK-GI-NEXT: fmov x0, d0
541 %aext = sext <4 x i32> %a to <4 x i64>
542 %bext = sext <4 x i32> %b to <4 x i64>
543 %abdiff = sub nsw <4 x i64> %aext, %bext
544 %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
545 %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
546 %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
547 %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
; Widening abs-diff (<2 x i32> zext to i64) + pairwise add reduction:
; SelectionDAG folds to uabdl.2d + addp; GlobalISel expands.
551 define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) {
552 ; CHECK-SD-LABEL: uabdl2d_rdx_i64:
553 ; CHECK-SD: // %bb.0:
554 ; CHECK-SD-NEXT: uabdl.2d v0, v0, v1
555 ; CHECK-SD-NEXT: addp.2d d0, v0
556 ; CHECK-SD-NEXT: fmov x0, d0
559 ; CHECK-GI-LABEL: uabdl2d_rdx_i64:
560 ; CHECK-GI: // %bb.0:
561 ; CHECK-GI-NEXT: usubl.2d v0, v0, v1
562 ; CHECK-GI-NEXT: cmlt.2d v1, v0, #0
563 ; CHECK-GI-NEXT: neg.2d v2, v0
564 ; CHECK-GI-NEXT: bit.16b v0, v2, v1
565 ; CHECK-GI-NEXT: addp.2d d0, v0
566 ; CHECK-GI-NEXT: fmov x0, d0
568 %aext = zext <2 x i32> %a to <2 x i64>
569 %bext = zext <2 x i32> %b to <2 x i64>
570 %abdiff = sub nsw <2 x i64> %aext, %bext
571 %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
572 %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
573 %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
574 %reduced_v = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %absel)
; --- Floating-point absolute difference (fabd intrinsic) ---
; Direct lowering of the aarch64.neon.fabd intrinsics to the FABD instruction
; at each supported vector FP width.
578 define <2 x float> @fabd_2s(ptr %A, ptr %B) nounwind {
579 ; CHECK-LABEL: fabd_2s:
581 ; CHECK-NEXT: ldr d0, [x0]
582 ; CHECK-NEXT: ldr d1, [x1]
583 ; CHECK-NEXT: fabd.2s v0, v0, v1
585 %tmp1 = load <2 x float>, ptr %A
586 %tmp2 = load <2 x float>, ptr %B
587 %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
588 ret <2 x float> %tmp3
591 define <4 x float> @fabd_4s(ptr %A, ptr %B) nounwind {
592 ; CHECK-LABEL: fabd_4s:
594 ; CHECK-NEXT: ldr q0, [x0]
595 ; CHECK-NEXT: ldr q1, [x1]
596 ; CHECK-NEXT: fabd.4s v0, v0, v1
598 %tmp1 = load <4 x float>, ptr %A
599 %tmp2 = load <4 x float>, ptr %B
600 %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
601 ret <4 x float> %tmp3
604 define <2 x double> @fabd_2d(ptr %A, ptr %B) nounwind {
605 ; CHECK-LABEL: fabd_2d:
607 ; CHECK-NEXT: ldr q0, [x0]
608 ; CHECK-NEXT: ldr q1, [x1]
609 ; CHECK-NEXT: fabd.2d v0, v0, v1
611 %tmp1 = load <2 x double>, ptr %A
612 %tmp2 = load <2 x double>, ptr %B
613 %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
614 ret <2 x double> %tmp3
617 declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
618 declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
619 declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
; --- fabs(fsub(a, b)) -> FABD ---
; The generic fsub + llvm.fabs pattern should also combine into a single FABD
; instruction, without going through the target intrinsic.
621 define <2 x float> @fabd_2s_from_fsub_fabs(ptr %A, ptr %B) nounwind {
622 ; CHECK-LABEL: fabd_2s_from_fsub_fabs:
624 ; CHECK-NEXT: ldr d0, [x0]
625 ; CHECK-NEXT: ldr d1, [x1]
626 ; CHECK-NEXT: fabd.2s v0, v0, v1
628 %tmp1 = load <2 x float>, ptr %A
629 %tmp2 = load <2 x float>, ptr %B
630 %sub = fsub <2 x float> %tmp1, %tmp2
631 %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
635 define <4 x float> @fabd_4s_from_fsub_fabs(ptr %A, ptr %B) nounwind {
636 ; CHECK-LABEL: fabd_4s_from_fsub_fabs:
638 ; CHECK-NEXT: ldr q0, [x0]
639 ; CHECK-NEXT: ldr q1, [x1]
640 ; CHECK-NEXT: fabd.4s v0, v0, v1
642 %tmp1 = load <4 x float>, ptr %A
643 %tmp2 = load <4 x float>, ptr %B
644 %sub = fsub <4 x float> %tmp1, %tmp2
645 %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
649 define <2 x double> @fabd_2d_from_fsub_fabs(ptr %A, ptr %B) nounwind {
650 ; CHECK-LABEL: fabd_2d_from_fsub_fabs:
652 ; CHECK-NEXT: ldr q0, [x0]
653 ; CHECK-NEXT: ldr q1, [x1]
654 ; CHECK-NEXT: fabd.2d v0, v0, v1
656 %tmp1 = load <2 x double>, ptr %A
657 %tmp2 = load <2 x double>, ptr %B
658 %sub = fsub <2 x double> %tmp1, %tmp2
659 %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
660 ret <2 x double> %abs
663 declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
664 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
665 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
; --- Non-widening signed absolute difference (sabd) ---
; One test per supported integer vector type; each sabd intrinsic should map
; 1:1 onto the sabd instruction of matching arrangement.
667 define <8 x i8> @sabd_8b(ptr %A, ptr %B) nounwind {
668 ; CHECK-LABEL: sabd_8b:
670 ; CHECK-NEXT: ldr d0, [x0]
671 ; CHECK-NEXT: ldr d1, [x1]
672 ; CHECK-NEXT: sabd.8b v0, v0, v1
674 %tmp1 = load <8 x i8>, ptr %A
675 %tmp2 = load <8 x i8>, ptr %B
676 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
680 define <16 x i8> @sabd_16b(ptr %A, ptr %B) nounwind {
681 ; CHECK-LABEL: sabd_16b:
683 ; CHECK-NEXT: ldr q0, [x0]
684 ; CHECK-NEXT: ldr q1, [x1]
685 ; CHECK-NEXT: sabd.16b v0, v0, v1
687 %tmp1 = load <16 x i8>, ptr %A
688 %tmp2 = load <16 x i8>, ptr %B
689 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
693 define <4 x i16> @sabd_4h(ptr %A, ptr %B) nounwind {
694 ; CHECK-LABEL: sabd_4h:
696 ; CHECK-NEXT: ldr d0, [x0]
697 ; CHECK-NEXT: ldr d1, [x1]
698 ; CHECK-NEXT: sabd.4h v0, v0, v1
700 %tmp1 = load <4 x i16>, ptr %A
701 %tmp2 = load <4 x i16>, ptr %B
702 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
706 define <8 x i16> @sabd_8h(ptr %A, ptr %B) nounwind {
707 ; CHECK-LABEL: sabd_8h:
709 ; CHECK-NEXT: ldr q0, [x0]
710 ; CHECK-NEXT: ldr q1, [x1]
711 ; CHECK-NEXT: sabd.8h v0, v0, v1
713 %tmp1 = load <8 x i16>, ptr %A
714 %tmp2 = load <8 x i16>, ptr %B
715 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
719 define <2 x i32> @sabd_2s(ptr %A, ptr %B) nounwind {
720 ; CHECK-LABEL: sabd_2s:
722 ; CHECK-NEXT: ldr d0, [x0]
723 ; CHECK-NEXT: ldr d1, [x1]
724 ; CHECK-NEXT: sabd.2s v0, v0, v1
726 %tmp1 = load <2 x i32>, ptr %A
727 %tmp2 = load <2 x i32>, ptr %B
728 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
732 define <4 x i32> @sabd_4s(ptr %A, ptr %B) nounwind {
733 ; CHECK-LABEL: sabd_4s:
735 ; CHECK-NEXT: ldr q0, [x0]
736 ; CHECK-NEXT: ldr q1, [x1]
737 ; CHECK-NEXT: sabd.4s v0, v0, v1
739 %tmp1 = load <4 x i32>, ptr %A
740 %tmp2 = load <4 x i32>, ptr %B
741 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
745 declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
746 declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
747 declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
748 declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
749 declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
750 declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
; --- Non-widening unsigned absolute difference (uabd) ---
; Unsigned counterpart of the sabd tests: each uabd intrinsic should map 1:1
; onto the uabd instruction of matching arrangement.
752 define <8 x i8> @uabd_8b(ptr %A, ptr %B) nounwind {
753 ; CHECK-LABEL: uabd_8b:
755 ; CHECK-NEXT: ldr d0, [x0]
756 ; CHECK-NEXT: ldr d1, [x1]
757 ; CHECK-NEXT: uabd.8b v0, v0, v1
759 %tmp1 = load <8 x i8>, ptr %A
760 %tmp2 = load <8 x i8>, ptr %B
761 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
765 define <16 x i8> @uabd_16b(ptr %A, ptr %B) nounwind {
766 ; CHECK-LABEL: uabd_16b:
768 ; CHECK-NEXT: ldr q0, [x0]
769 ; CHECK-NEXT: ldr q1, [x1]
770 ; CHECK-NEXT: uabd.16b v0, v0, v1
772 %tmp1 = load <16 x i8>, ptr %A
773 %tmp2 = load <16 x i8>, ptr %B
774 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
778 define <4 x i16> @uabd_4h(ptr %A, ptr %B) nounwind {
779 ; CHECK-LABEL: uabd_4h:
781 ; CHECK-NEXT: ldr d0, [x0]
782 ; CHECK-NEXT: ldr d1, [x1]
783 ; CHECK-NEXT: uabd.4h v0, v0, v1
785 %tmp1 = load <4 x i16>, ptr %A
786 %tmp2 = load <4 x i16>, ptr %B
787 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
791 define <8 x i16> @uabd_8h(ptr %A, ptr %B) nounwind {
792 ; CHECK-LABEL: uabd_8h:
794 ; CHECK-NEXT: ldr q0, [x0]
795 ; CHECK-NEXT: ldr q1, [x1]
796 ; CHECK-NEXT: uabd.8h v0, v0, v1
798 %tmp1 = load <8 x i16>, ptr %A
799 %tmp2 = load <8 x i16>, ptr %B
800 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
804 define <2 x i32> @uabd_2s(ptr %A, ptr %B) nounwind {
805 ; CHECK-LABEL: uabd_2s:
807 ; CHECK-NEXT: ldr d0, [x0]
808 ; CHECK-NEXT: ldr d1, [x1]
809 ; CHECK-NEXT: uabd.2s v0, v0, v1
811 %tmp1 = load <2 x i32>, ptr %A
812 %tmp2 = load <2 x i32>, ptr %B
813 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
817 define <4 x i32> @uabd_4s(ptr %A, ptr %B) nounwind {
818 ; CHECK-LABEL: uabd_4s:
820 ; CHECK-NEXT: ldr q0, [x0]
821 ; CHECK-NEXT: ldr q1, [x1]
822 ; CHECK-NEXT: uabd.4s v0, v0, v1
824 %tmp1 = load <4 x i32>, ptr %A
825 %tmp2 = load <4 x i32>, ptr %B
826 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
830 declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
831 declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
832 declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
833 declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
834 declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
835 declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
; --- Saturating absolute value (sqabs) ---
; Each unary sqabs intrinsic should map 1:1 onto the sqabs instruction of
; matching arrangement.
837 define <8 x i8> @sqabs_8b(ptr %A) nounwind {
838 ; CHECK-LABEL: sqabs_8b:
840 ; CHECK-NEXT: ldr d0, [x0]
841 ; CHECK-NEXT: sqabs.8b v0, v0
843 %tmp1 = load <8 x i8>, ptr %A
844 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
848 define <16 x i8> @sqabs_16b(ptr %A) nounwind {
849 ; CHECK-LABEL: sqabs_16b:
851 ; CHECK-NEXT: ldr q0, [x0]
852 ; CHECK-NEXT: sqabs.16b v0, v0
854 %tmp1 = load <16 x i8>, ptr %A
855 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
859 define <4 x i16> @sqabs_4h(ptr %A) nounwind {
860 ; CHECK-LABEL: sqabs_4h:
862 ; CHECK-NEXT: ldr d0, [x0]
863 ; CHECK-NEXT: sqabs.4h v0, v0
865 %tmp1 = load <4 x i16>, ptr %A
866 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
870 define <8 x i16> @sqabs_8h(ptr %A) nounwind {
871 ; CHECK-LABEL: sqabs_8h:
873 ; CHECK-NEXT: ldr q0, [x0]
874 ; CHECK-NEXT: sqabs.8h v0, v0
876 %tmp1 = load <8 x i16>, ptr %A
877 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
881 define <2 x i32> @sqabs_2s(ptr %A) nounwind {
882 ; CHECK-LABEL: sqabs_2s:
884 ; CHECK-NEXT: ldr d0, [x0]
885 ; CHECK-NEXT: sqabs.2s v0, v0
887 %tmp1 = load <2 x i32>, ptr %A
888 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
892 define <4 x i32> @sqabs_4s(ptr %A) nounwind {
893 ; CHECK-LABEL: sqabs_4s:
895 ; CHECK-NEXT: ldr q0, [x0]
896 ; CHECK-NEXT: sqabs.4s v0, v0
898 %tmp1 = load <4 x i32>, ptr %A
899 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
903 declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
904 declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
905 declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
906 declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
907 declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
908 declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
; --- Saturating negate (sqneg) ---
; Each unary sqneg intrinsic should map 1:1 onto the sqneg instruction of
; matching arrangement.
910 define <8 x i8> @sqneg_8b(ptr %A) nounwind {
911 ; CHECK-LABEL: sqneg_8b:
913 ; CHECK-NEXT: ldr d0, [x0]
914 ; CHECK-NEXT: sqneg.8b v0, v0
916 %tmp1 = load <8 x i8>, ptr %A
917 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
921 define <16 x i8> @sqneg_16b(ptr %A) nounwind {
922 ; CHECK-LABEL: sqneg_16b:
924 ; CHECK-NEXT: ldr q0, [x0]
925 ; CHECK-NEXT: sqneg.16b v0, v0
927 %tmp1 = load <16 x i8>, ptr %A
928 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
932 define <4 x i16> @sqneg_4h(ptr %A) nounwind {
933 ; CHECK-LABEL: sqneg_4h:
935 ; CHECK-NEXT: ldr d0, [x0]
936 ; CHECK-NEXT: sqneg.4h v0, v0
938 %tmp1 = load <4 x i16>, ptr %A
939 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
943 define <8 x i16> @sqneg_8h(ptr %A) nounwind {
944 ; CHECK-LABEL: sqneg_8h:
946 ; CHECK-NEXT: ldr q0, [x0]
947 ; CHECK-NEXT: sqneg.8h v0, v0
949 %tmp1 = load <8 x i16>, ptr %A
950 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
954 define <2 x i32> @sqneg_2s(ptr %A) nounwind {
955 ; CHECK-LABEL: sqneg_2s:
957 ; CHECK-NEXT: ldr d0, [x0]
958 ; CHECK-NEXT: sqneg.2s v0, v0
960 %tmp1 = load <2 x i32>, ptr %A
961 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
965 define <4 x i32> @sqneg_4s(ptr %A) nounwind {
966 ; CHECK-LABEL: sqneg_4s:
968 ; CHECK-NEXT: ldr q0, [x0]
969 ; CHECK-NEXT: sqneg.4s v0, v0
971 %tmp1 = load <4 x i32>, ptr %A
972 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
976 declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
977 declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
978 declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
979 declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
980 declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
981 declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
983 define <8 x i8> @abs_8b(ptr %A) nounwind {
984 ; CHECK-LABEL: abs_8b:
986 ; CHECK-NEXT: ldr d0, [x0]
987 ; CHECK-NEXT: abs.8b v0, v0
989 %tmp1 = load <8 x i8>, ptr %A
990 %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
994 define <16 x i8> @abs_16b(ptr %A) nounwind {
995 ; CHECK-LABEL: abs_16b:
997 ; CHECK-NEXT: ldr q0, [x0]
998 ; CHECK-NEXT: abs.16b v0, v0
1000 %tmp1 = load <16 x i8>, ptr %A
1001 %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
1005 define <4 x i16> @abs_4h(ptr %A) nounwind {
1006 ; CHECK-LABEL: abs_4h:
1008 ; CHECK-NEXT: ldr d0, [x0]
1009 ; CHECK-NEXT: abs.4h v0, v0
1011 %tmp1 = load <4 x i16>, ptr %A
1012 %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
1016 define <8 x i16> @abs_8h(ptr %A) nounwind {
1017 ; CHECK-LABEL: abs_8h:
1019 ; CHECK-NEXT: ldr q0, [x0]
1020 ; CHECK-NEXT: abs.8h v0, v0
1022 %tmp1 = load <8 x i16>, ptr %A
1023 %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
1027 define <2 x i32> @abs_2s(ptr %A) nounwind {
1028 ; CHECK-LABEL: abs_2s:
1030 ; CHECK-NEXT: ldr d0, [x0]
1031 ; CHECK-NEXT: abs.2s v0, v0
1033 %tmp1 = load <2 x i32>, ptr %A
1034 %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
1038 define <4 x i32> @abs_4s(ptr %A) nounwind {
1039 ; CHECK-LABEL: abs_4s:
1041 ; CHECK-NEXT: ldr q0, [x0]
1042 ; CHECK-NEXT: abs.4s v0, v0
1044 %tmp1 = load <4 x i32>, ptr %A
1045 %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
1049 define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
1050 ; CHECK-LABEL: abs_1d:
1052 ; CHECK-NEXT: abs d0, d0
1054 %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
1058 define i64 @abs_1d_honestly(i64 %A) nounwind {
1059 ; CHECK-LABEL: abs_1d_honestly:
1061 ; CHECK-NEXT: fmov d0, x0
1062 ; CHECK-NEXT: abs d0, d0
1063 ; CHECK-NEXT: fmov x0, d0
1065 %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
1069 declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
1070 declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
1071 declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
1072 declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
1073 declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
1074 declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
1075 declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
1076 declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
1078 define <8 x i16> @sabal8h(ptr %A, ptr %B, ptr %C) nounwind {
1079 ; CHECK-LABEL: sabal8h:
1081 ; CHECK-NEXT: ldr d1, [x0]
1082 ; CHECK-NEXT: ldr d2, [x1]
1083 ; CHECK-NEXT: ldr q0, [x2]
1084 ; CHECK-NEXT: sabal.8h v0, v1, v2
1086 %tmp1 = load <8 x i8>, ptr %A
1087 %tmp2 = load <8 x i8>, ptr %B
1088 %tmp3 = load <8 x i16>, ptr %C
1089 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1090 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1091 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1095 define <4 x i32> @sabal4s(ptr %A, ptr %B, ptr %C) nounwind {
1096 ; CHECK-LABEL: sabal4s:
1098 ; CHECK-NEXT: ldr d1, [x0]
1099 ; CHECK-NEXT: ldr d2, [x1]
1100 ; CHECK-NEXT: ldr q0, [x2]
1101 ; CHECK-NEXT: sabal.4s v0, v1, v2
1103 %tmp1 = load <4 x i16>, ptr %A
1104 %tmp2 = load <4 x i16>, ptr %B
1105 %tmp3 = load <4 x i32>, ptr %C
1106 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1107 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1108 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1112 define <2 x i64> @sabal2d(ptr %A, ptr %B, ptr %C) nounwind {
1113 ; CHECK-LABEL: sabal2d:
1115 ; CHECK-NEXT: ldr d1, [x0]
1116 ; CHECK-NEXT: ldr d2, [x1]
1117 ; CHECK-NEXT: ldr q0, [x2]
1118 ; CHECK-NEXT: sabal.2d v0, v1, v2
1120 %tmp1 = load <2 x i32>, ptr %A
1121 %tmp2 = load <2 x i32>, ptr %B
1122 %tmp3 = load <2 x i64>, ptr %C
1123 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1124 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1126 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
1130 define <8 x i16> @sabal2_8h(ptr %A, ptr %B, ptr %C) nounwind {
1131 ; CHECK-SD-LABEL: sabal2_8h:
1132 ; CHECK-SD: // %bb.0:
1133 ; CHECK-SD-NEXT: ldr q0, [x2]
1134 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1135 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1136 ; CHECK-SD-NEXT: sabal.8h v0, v1, v2
1137 ; CHECK-SD-NEXT: ret
1139 ; CHECK-GI-LABEL: sabal2_8h:
1140 ; CHECK-GI: // %bb.0:
1141 ; CHECK-GI-NEXT: ldr q1, [x0]
1142 ; CHECK-GI-NEXT: ldr q2, [x1]
1143 ; CHECK-GI-NEXT: ldr q0, [x2]
1144 ; CHECK-GI-NEXT: sabal2.8h v0, v1, v2
1145 ; CHECK-GI-NEXT: ret
1146 %load1 = load <16 x i8>, ptr %A
1147 %load2 = load <16 x i8>, ptr %B
1148 %tmp3 = load <8 x i16>, ptr %C
1149 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1150 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1151 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1152 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1153 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1157 define <4 x i32> @sabal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
1158 ; CHECK-SD-LABEL: sabal2_4s:
1159 ; CHECK-SD: // %bb.0:
1160 ; CHECK-SD-NEXT: ldr q0, [x2]
1161 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1162 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1163 ; CHECK-SD-NEXT: sabal.4s v0, v1, v2
1164 ; CHECK-SD-NEXT: ret
1166 ; CHECK-GI-LABEL: sabal2_4s:
1167 ; CHECK-GI: // %bb.0:
1168 ; CHECK-GI-NEXT: ldr q1, [x0]
1169 ; CHECK-GI-NEXT: ldr q2, [x1]
1170 ; CHECK-GI-NEXT: ldr q0, [x2]
1171 ; CHECK-GI-NEXT: sabal2.4s v0, v1, v2
1172 ; CHECK-GI-NEXT: ret
1173 %load1 = load <8 x i16>, ptr %A
1174 %load2 = load <8 x i16>, ptr %B
1175 %tmp3 = load <4 x i32>, ptr %C
1176 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1177 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1178 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1179 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1180 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1184 define <2 x i64> @sabal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
1185 ; CHECK-SD-LABEL: sabal2_2d:
1186 ; CHECK-SD: // %bb.0:
1187 ; CHECK-SD-NEXT: ldr q0, [x2]
1188 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1189 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1190 ; CHECK-SD-NEXT: sabal.2d v0, v1, v2
1191 ; CHECK-SD-NEXT: ret
1193 ; CHECK-GI-LABEL: sabal2_2d:
1194 ; CHECK-GI: // %bb.0:
1195 ; CHECK-GI-NEXT: ldr q1, [x0]
1196 ; CHECK-GI-NEXT: ldr q2, [x1]
1197 ; CHECK-GI-NEXT: ldr q0, [x2]
1198 ; CHECK-GI-NEXT: sabal2.2d v0, v1, v2
1199 ; CHECK-GI-NEXT: ret
1200 %load1 = load <4 x i32>, ptr %A
1201 %load2 = load <4 x i32>, ptr %B
1202 %tmp3 = load <2 x i64>, ptr %C
1203 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1204 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1205 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1206 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1207 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
1211 define <8 x i16> @uabal8h(ptr %A, ptr %B, ptr %C) nounwind {
1212 ; CHECK-LABEL: uabal8h:
1214 ; CHECK-NEXT: ldr d1, [x0]
1215 ; CHECK-NEXT: ldr d2, [x1]
1216 ; CHECK-NEXT: ldr q0, [x2]
1217 ; CHECK-NEXT: uabal.8h v0, v1, v2
1219 %tmp1 = load <8 x i8>, ptr %A
1220 %tmp2 = load <8 x i8>, ptr %B
1221 %tmp3 = load <8 x i16>, ptr %C
1222 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1223 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1224 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1228 define <4 x i32> @uabal4s(ptr %A, ptr %B, ptr %C) nounwind {
1229 ; CHECK-LABEL: uabal4s:
1231 ; CHECK-NEXT: ldr d1, [x0]
1232 ; CHECK-NEXT: ldr d2, [x1]
1233 ; CHECK-NEXT: ldr q0, [x2]
1234 ; CHECK-NEXT: uabal.4s v0, v1, v2
1236 %tmp1 = load <4 x i16>, ptr %A
1237 %tmp2 = load <4 x i16>, ptr %B
1238 %tmp3 = load <4 x i32>, ptr %C
1239 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1240 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1241 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1245 define <2 x i64> @uabal2d(ptr %A, ptr %B, ptr %C) nounwind {
1246 ; CHECK-LABEL: uabal2d:
1248 ; CHECK-NEXT: ldr d1, [x0]
1249 ; CHECK-NEXT: ldr d2, [x1]
1250 ; CHECK-NEXT: ldr q0, [x2]
1251 ; CHECK-NEXT: uabal.2d v0, v1, v2
1253 %tmp1 = load <2 x i32>, ptr %A
1254 %tmp2 = load <2 x i32>, ptr %B
1255 %tmp3 = load <2 x i64>, ptr %C
1256 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1257 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1258 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
1262 define <8 x i16> @uabal2_8h(ptr %A, ptr %B, ptr %C) nounwind {
1263 ; CHECK-SD-LABEL: uabal2_8h:
1264 ; CHECK-SD: // %bb.0:
1265 ; CHECK-SD-NEXT: ldr q0, [x2]
1266 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1267 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1268 ; CHECK-SD-NEXT: uabal.8h v0, v1, v2
1269 ; CHECK-SD-NEXT: ret
1271 ; CHECK-GI-LABEL: uabal2_8h:
1272 ; CHECK-GI: // %bb.0:
1273 ; CHECK-GI-NEXT: ldr q1, [x0]
1274 ; CHECK-GI-NEXT: ldr q2, [x1]
1275 ; CHECK-GI-NEXT: ldr q0, [x2]
1276 ; CHECK-GI-NEXT: uabal2.8h v0, v1, v2
1277 ; CHECK-GI-NEXT: ret
1278 %load1 = load <16 x i8>, ptr %A
1279 %load2 = load <16 x i8>, ptr %B
1280 %tmp3 = load <8 x i16>, ptr %C
1281 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1282 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1283 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1284 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1285 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1289 define <4 x i32> @uabal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
1290 ; CHECK-SD-LABEL: uabal2_4s:
1291 ; CHECK-SD: // %bb.0:
1292 ; CHECK-SD-NEXT: ldr q0, [x2]
1293 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1294 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1295 ; CHECK-SD-NEXT: uabal.4s v0, v1, v2
1296 ; CHECK-SD-NEXT: ret
1298 ; CHECK-GI-LABEL: uabal2_4s:
1299 ; CHECK-GI: // %bb.0:
1300 ; CHECK-GI-NEXT: ldr q1, [x0]
1301 ; CHECK-GI-NEXT: ldr q2, [x1]
1302 ; CHECK-GI-NEXT: ldr q0, [x2]
1303 ; CHECK-GI-NEXT: uabal2.4s v0, v1, v2
1304 ; CHECK-GI-NEXT: ret
1305 %load1 = load <8 x i16>, ptr %A
1306 %load2 = load <8 x i16>, ptr %B
1307 %tmp3 = load <4 x i32>, ptr %C
1308 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1309 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1310 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1311 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1312 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1316 define <2 x i64> @uabal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
1317 ; CHECK-SD-LABEL: uabal2_2d:
1318 ; CHECK-SD: // %bb.0:
1319 ; CHECK-SD-NEXT: ldr q0, [x2]
1320 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1321 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1322 ; CHECK-SD-NEXT: uabal.2d v0, v1, v2
1323 ; CHECK-SD-NEXT: ret
1325 ; CHECK-GI-LABEL: uabal2_2d:
1326 ; CHECK-GI: // %bb.0:
1327 ; CHECK-GI-NEXT: ldr q1, [x0]
1328 ; CHECK-GI-NEXT: ldr q2, [x1]
1329 ; CHECK-GI-NEXT: ldr q0, [x2]
1330 ; CHECK-GI-NEXT: uabal2.2d v0, v1, v2
1331 ; CHECK-GI-NEXT: ret
1332 %load1 = load <4 x i32>, ptr %A
1333 %load2 = load <4 x i32>, ptr %B
1334 %tmp3 = load <2 x i64>, ptr %C
1335 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1336 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1337 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1338 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1339 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
1343 define <8 x i8> @saba_8b(ptr %A, ptr %B, ptr %C) nounwind {
1344 ; CHECK-LABEL: saba_8b:
1346 ; CHECK-NEXT: ldr d1, [x0]
1347 ; CHECK-NEXT: ldr d2, [x1]
1348 ; CHECK-NEXT: ldr d0, [x2]
1349 ; CHECK-NEXT: saba.8b v0, v1, v2
1351 %tmp1 = load <8 x i8>, ptr %A
1352 %tmp2 = load <8 x i8>, ptr %B
1353 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1354 %tmp4 = load <8 x i8>, ptr %C
1355 %tmp5 = add <8 x i8> %tmp3, %tmp4
1359 define <16 x i8> @saba_16b(ptr %A, ptr %B, ptr %C) nounwind {
1360 ; CHECK-LABEL: saba_16b:
1362 ; CHECK-NEXT: ldr q1, [x0]
1363 ; CHECK-NEXT: ldr q2, [x1]
1364 ; CHECK-NEXT: ldr q0, [x2]
1365 ; CHECK-NEXT: saba.16b v0, v1, v2
1367 %tmp1 = load <16 x i8>, ptr %A
1368 %tmp2 = load <16 x i8>, ptr %B
1369 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
1370 %tmp4 = load <16 x i8>, ptr %C
1371 %tmp5 = add <16 x i8> %tmp3, %tmp4
1375 define <4 x i16> @saba_4h(ptr %A, ptr %B, ptr %C) nounwind {
1376 ; CHECK-LABEL: saba_4h:
1378 ; CHECK-NEXT: ldr d1, [x0]
1379 ; CHECK-NEXT: ldr d2, [x1]
1380 ; CHECK-NEXT: ldr d0, [x2]
1381 ; CHECK-NEXT: saba.4h v0, v1, v2
1383 %tmp1 = load <4 x i16>, ptr %A
1384 %tmp2 = load <4 x i16>, ptr %B
1385 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1386 %tmp4 = load <4 x i16>, ptr %C
1387 %tmp5 = add <4 x i16> %tmp3, %tmp4
1391 define <8 x i16> @saba_8h(ptr %A, ptr %B, ptr %C) nounwind {
1392 ; CHECK-LABEL: saba_8h:
1394 ; CHECK-NEXT: ldr q1, [x0]
1395 ; CHECK-NEXT: ldr q2, [x1]
1396 ; CHECK-NEXT: ldr q0, [x2]
1397 ; CHECK-NEXT: saba.8h v0, v1, v2
1399 %tmp1 = load <8 x i16>, ptr %A
1400 %tmp2 = load <8 x i16>, ptr %B
1401 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
1402 %tmp4 = load <8 x i16>, ptr %C
1403 %tmp5 = add <8 x i16> %tmp3, %tmp4
1407 define <2 x i32> @saba_2s(ptr %A, ptr %B, ptr %C) nounwind {
1408 ; CHECK-LABEL: saba_2s:
1410 ; CHECK-NEXT: ldr d1, [x0]
1411 ; CHECK-NEXT: ldr d2, [x1]
1412 ; CHECK-NEXT: ldr d0, [x2]
1413 ; CHECK-NEXT: saba.2s v0, v1, v2
1415 %tmp1 = load <2 x i32>, ptr %A
1416 %tmp2 = load <2 x i32>, ptr %B
1417 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1418 %tmp4 = load <2 x i32>, ptr %C
1419 %tmp5 = add <2 x i32> %tmp3, %tmp4
1423 define <4 x i32> @saba_4s(ptr %A, ptr %B, ptr %C) nounwind {
1424 ; CHECK-LABEL: saba_4s:
1426 ; CHECK-NEXT: ldr q1, [x0]
1427 ; CHECK-NEXT: ldr q2, [x1]
1428 ; CHECK-NEXT: ldr q0, [x2]
1429 ; CHECK-NEXT: saba.4s v0, v1, v2
1431 %tmp1 = load <4 x i32>, ptr %A
1432 %tmp2 = load <4 x i32>, ptr %B
1433 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
1434 %tmp4 = load <4 x i32>, ptr %C
1435 %tmp5 = add <4 x i32> %tmp3, %tmp4
1439 define <8 x i8> @uaba_8b(ptr %A, ptr %B, ptr %C) nounwind {
1440 ; CHECK-LABEL: uaba_8b:
1442 ; CHECK-NEXT: ldr d1, [x0]
1443 ; CHECK-NEXT: ldr d2, [x1]
1444 ; CHECK-NEXT: ldr d0, [x2]
1445 ; CHECK-NEXT: uaba.8b v0, v1, v2
1447 %tmp1 = load <8 x i8>, ptr %A
1448 %tmp2 = load <8 x i8>, ptr %B
1449 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1450 %tmp4 = load <8 x i8>, ptr %C
1451 %tmp5 = add <8 x i8> %tmp3, %tmp4
1455 define <16 x i8> @uaba_16b(ptr %A, ptr %B, ptr %C) nounwind {
1456 ; CHECK-LABEL: uaba_16b:
1458 ; CHECK-NEXT: ldr q1, [x0]
1459 ; CHECK-NEXT: ldr q2, [x1]
1460 ; CHECK-NEXT: ldr q0, [x2]
1461 ; CHECK-NEXT: uaba.16b v0, v1, v2
1463 %tmp1 = load <16 x i8>, ptr %A
1464 %tmp2 = load <16 x i8>, ptr %B
1465 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
1466 %tmp4 = load <16 x i8>, ptr %C
1467 %tmp5 = add <16 x i8> %tmp3, %tmp4
1471 define <4 x i16> @uaba_4h(ptr %A, ptr %B, ptr %C) nounwind {
1472 ; CHECK-LABEL: uaba_4h:
1474 ; CHECK-NEXT: ldr d1, [x0]
1475 ; CHECK-NEXT: ldr d2, [x1]
1476 ; CHECK-NEXT: ldr d0, [x2]
1477 ; CHECK-NEXT: uaba.4h v0, v1, v2
1479 %tmp1 = load <4 x i16>, ptr %A
1480 %tmp2 = load <4 x i16>, ptr %B
1481 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1482 %tmp4 = load <4 x i16>, ptr %C
1483 %tmp5 = add <4 x i16> %tmp3, %tmp4
1487 define <8 x i16> @uaba_8h(ptr %A, ptr %B, ptr %C) nounwind {
1488 ; CHECK-LABEL: uaba_8h:
1490 ; CHECK-NEXT: ldr q1, [x0]
1491 ; CHECK-NEXT: ldr q2, [x1]
1492 ; CHECK-NEXT: ldr q0, [x2]
1493 ; CHECK-NEXT: uaba.8h v0, v1, v2
1495 %tmp1 = load <8 x i16>, ptr %A
1496 %tmp2 = load <8 x i16>, ptr %B
1497 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
1498 %tmp4 = load <8 x i16>, ptr %C
1499 %tmp5 = add <8 x i16> %tmp3, %tmp4
1503 define <2 x i32> @uaba_2s(ptr %A, ptr %B, ptr %C) nounwind {
1504 ; CHECK-LABEL: uaba_2s:
1506 ; CHECK-NEXT: ldr d1, [x0]
1507 ; CHECK-NEXT: ldr d2, [x1]
1508 ; CHECK-NEXT: ldr d0, [x2]
1509 ; CHECK-NEXT: uaba.2s v0, v1, v2
1511 %tmp1 = load <2 x i32>, ptr %A
1512 %tmp2 = load <2 x i32>, ptr %B
1513 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1514 %tmp4 = load <2 x i32>, ptr %C
1515 %tmp5 = add <2 x i32> %tmp3, %tmp4
1519 define <4 x i32> @uaba_4s(ptr %A, ptr %B, ptr %C) nounwind {
1520 ; CHECK-LABEL: uaba_4s:
1522 ; CHECK-NEXT: ldr q1, [x0]
1523 ; CHECK-NEXT: ldr q2, [x1]
1524 ; CHECK-NEXT: ldr q0, [x2]
1525 ; CHECK-NEXT: uaba.4s v0, v1, v2
1527 %tmp1 = load <4 x i32>, ptr %A
1528 %tmp2 = load <4 x i32>, ptr %B
1529 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
1530 %tmp4 = load <4 x i32>, ptr %C
1531 %tmp5 = add <4 x i32> %tmp3, %tmp4
1536 define float @fabds(float %a, float %b) nounwind {
1537 ; CHECK-LABEL: fabds:
1539 ; CHECK-NEXT: fabd s0, s0, s1
1541 %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
1545 define double @fabdd(double %a, double %b) nounwind {
1546 ; CHECK-LABEL: fabdd:
1548 ; CHECK-NEXT: fabd d0, d0, d1
1550 %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
1554 declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
1555 declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
1557 define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
1558 ; CHECK-LABEL: fabds_from_fsub_fabs:
1560 ; CHECK-NEXT: fabd s0, s0, s1
1562 %sub = fsub float %a, %b
1563 %abs = tail call float @llvm.fabs.f32(float %sub)
1567 define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
1568 ; CHECK-LABEL: fabdd_from_fsub_fabs:
1570 ; CHECK-NEXT: fabd d0, d0, d1
1572 %sub = fsub double %a, %b
1573 %abs = tail call double @llvm.fabs.f64(double %sub)
1577 declare float @llvm.fabs.f32(float) nounwind readnone
1578 declare double @llvm.fabs.f64(double) nounwind readnone
1580 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1581 ; CHECK-LABEL: uabdl_from_extract_dup:
1583 ; CHECK-NEXT: dup.2s v1, w0
1584 ; CHECK-NEXT: uabdl.2d v0, v0, v1
1586 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1587 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1588 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1589 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1590 %res1 = zext <2 x i32> %res to <2 x i64>
1594 define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1595 ; CHECK-SD-LABEL: uabdl2_from_extract_dup:
1596 ; CHECK-SD: // %bb.0:
1597 ; CHECK-SD-NEXT: dup.4s v1, w0
1598 ; CHECK-SD-NEXT: uabdl2.2d v0, v0, v1
1599 ; CHECK-SD-NEXT: ret
1601 ; CHECK-GI-LABEL: uabdl2_from_extract_dup:
1602 ; CHECK-GI: // %bb.0:
1603 ; CHECK-GI-NEXT: dup.2s v1, w0
1604 ; CHECK-GI-NEXT: mov d0, v0[1]
1605 ; CHECK-GI-NEXT: uabdl.2d v0, v0, v1
1606 ; CHECK-GI-NEXT: ret
1607 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1608 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1609 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1610 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1611 %res1 = zext <2 x i32> %res to <2 x i64>
1615 define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1616 ; CHECK-LABEL: sabdl_from_extract_dup:
1618 ; CHECK-NEXT: dup.2s v1, w0
1619 ; CHECK-NEXT: sabdl.2d v0, v0, v1
1621 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1622 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1623 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1624 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1625 %res1 = zext <2 x i32> %res to <2 x i64>
1629 define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1630 ; CHECK-SD-LABEL: sabdl2_from_extract_dup:
1631 ; CHECK-SD: // %bb.0:
1632 ; CHECK-SD-NEXT: dup.4s v1, w0
1633 ; CHECK-SD-NEXT: sabdl2.2d v0, v0, v1
1634 ; CHECK-SD-NEXT: ret
1636 ; CHECK-GI-LABEL: sabdl2_from_extract_dup:
1637 ; CHECK-GI: // %bb.0:
1638 ; CHECK-GI-NEXT: dup.2s v1, w0
1639 ; CHECK-GI-NEXT: mov d0, v0[1]
1640 ; CHECK-GI-NEXT: sabdl.2d v0, v0, v1
1641 ; CHECK-GI-NEXT: ret
1642 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1643 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1644 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1645 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1646 %res1 = zext <2 x i32> %res to <2 x i64>
1650 define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
1651 ; CHECK-SD-LABEL: abspattern1:
1652 ; CHECK-SD: // %bb.0:
1653 ; CHECK-SD-NEXT: abs.2s v0, v0
1654 ; CHECK-SD-NEXT: ret
1656 ; CHECK-GI-LABEL: abspattern1:
1657 ; CHECK-GI: // %bb.0:
1658 ; CHECK-GI-NEXT: neg.2s v1, v0
1659 ; CHECK-GI-NEXT: cmge.2s v2, v0, #0
1660 ; CHECK-GI-NEXT: bif.8b v0, v1, v2
1661 ; CHECK-GI-NEXT: ret
1662 %tmp1neg = sub <2 x i32> zeroinitializer, %a
1663 %b = icmp sge <2 x i32> %a, zeroinitializer
1664 %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
1668 ; For GlobalISel, this generates terrible code until we can pattern match this to abs.
1669 define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
1670 ; CHECK-SD-LABEL: abspattern2:
1671 ; CHECK-SD: // %bb.0:
1672 ; CHECK-SD-NEXT: abs.4h v0, v0
1673 ; CHECK-SD-NEXT: ret
1675 ; CHECK-GI-LABEL: abspattern2:
1676 ; CHECK-GI: // %bb.0:
1677 ; CHECK-GI-NEXT: neg.4h v1, v0
1678 ; CHECK-GI-NEXT: cmgt.4h v2, v0, #0
1679 ; CHECK-GI-NEXT: bif.8b v0, v1, v2
1680 ; CHECK-GI-NEXT: ret
1681 %tmp1neg = sub <4 x i16> zeroinitializer, %a
1682 %b = icmp sgt <4 x i16> %a, zeroinitializer
1683 %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
1687 define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
1688 ; CHECK-SD-LABEL: abspattern3:
1689 ; CHECK-SD: // %bb.0:
1690 ; CHECK-SD-NEXT: abs.8b v0, v0
1691 ; CHECK-SD-NEXT: ret
1693 ; CHECK-GI-LABEL: abspattern3:
1694 ; CHECK-GI: // %bb.0:
1695 ; CHECK-GI-NEXT: neg.8b v1, v0
1696 ; CHECK-GI-NEXT: cmlt.8b v2, v0, #0
1697 ; CHECK-GI-NEXT: bit.8b v0, v1, v2
1698 ; CHECK-GI-NEXT: ret
1699 %tmp1neg = sub <8 x i8> zeroinitializer, %a
1700 %b = icmp slt <8 x i8> %a, zeroinitializer
1701 %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
1705 define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
1706 ; CHECK-SD-LABEL: abspattern4:
1707 ; CHECK-SD: // %bb.0:
1708 ; CHECK-SD-NEXT: abs.4s v0, v0
1709 ; CHECK-SD-NEXT: ret
1711 ; CHECK-GI-LABEL: abspattern4:
1712 ; CHECK-GI: // %bb.0:
1713 ; CHECK-GI-NEXT: neg.4s v1, v0
1714 ; CHECK-GI-NEXT: cmge.4s v2, v0, #0
1715 ; CHECK-GI-NEXT: bif.16b v0, v1, v2
1716 ; CHECK-GI-NEXT: ret
1717 %tmp1neg = sub <4 x i32> zeroinitializer, %a
1718 %b = icmp sge <4 x i32> %a, zeroinitializer
1719 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
1723 define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
1724 ; CHECK-SD-LABEL: abspattern5:
1725 ; CHECK-SD: // %bb.0:
1726 ; CHECK-SD-NEXT: abs.8h v0, v0
1727 ; CHECK-SD-NEXT: ret
1729 ; CHECK-GI-LABEL: abspattern5:
1730 ; CHECK-GI: // %bb.0:
1731 ; CHECK-GI-NEXT: neg.8h v1, v0
1732 ; CHECK-GI-NEXT: cmgt.8h v2, v0, #0
1733 ; CHECK-GI-NEXT: bif.16b v0, v1, v2
1734 ; CHECK-GI-NEXT: ret
1735 %tmp1neg = sub <8 x i16> zeroinitializer, %a
1736 %b = icmp sgt <8 x i16> %a, zeroinitializer
1737 %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
1741 define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
1742 ; CHECK-SD-LABEL: abspattern6:
1743 ; CHECK-SD: // %bb.0:
1744 ; CHECK-SD-NEXT: abs.16b v0, v0
1745 ; CHECK-SD-NEXT: ret
1747 ; CHECK-GI-LABEL: abspattern6:
1748 ; CHECK-GI: // %bb.0:
1749 ; CHECK-GI-NEXT: neg.16b v1, v0
1750 ; CHECK-GI-NEXT: cmlt.16b v2, v0, #0
1751 ; CHECK-GI-NEXT: bit.16b v0, v1, v2
1752 ; CHECK-GI-NEXT: ret
1753 %tmp1neg = sub <16 x i8> zeroinitializer, %a
1754 %b = icmp slt <16 x i8> %a, zeroinitializer
1755 %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
1759 define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
1760 ; CHECK-SD-LABEL: abspattern7:
1761 ; CHECK-SD: // %bb.0:
1762 ; CHECK-SD-NEXT: abs.2d v0, v0
1763 ; CHECK-SD-NEXT: ret
1765 ; CHECK-GI-LABEL: abspattern7:
1766 ; CHECK-GI: // %bb.0:
1767 ; CHECK-GI-NEXT: neg.2d v1, v0
1768 ; CHECK-GI-NEXT: cmle.2d v2, v0, #0
1769 ; CHECK-GI-NEXT: bit.16b v0, v1, v2
1770 ; CHECK-GI-NEXT: ret
1771 %tmp1neg = sub <2 x i64> zeroinitializer, %a
1772 %b = icmp sle <2 x i64> %a, zeroinitializer
1773 %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
1777 define <2 x i64> @uabd_i32(<2 x i32> %a, <2 x i32> %b) {
1778 ; CHECK-SD-LABEL: uabd_i32:
1779 ; CHECK-SD: // %bb.0:
1780 ; CHECK-SD-NEXT: sabdl.2d v0, v0, v1
1781 ; CHECK-SD-NEXT: ret
1783 ; CHECK-GI-LABEL: uabd_i32:
1784 ; CHECK-GI: // %bb.0:
1785 ; CHECK-GI-NEXT: ssubl.2d v0, v0, v1
1786 ; CHECK-GI-NEXT: cmlt.2d v1, v0, #0
1787 ; CHECK-GI-NEXT: neg.2d v2, v0
1788 ; CHECK-GI-NEXT: bit.16b v0, v2, v1
1789 ; CHECK-GI-NEXT: ret
1790 %aext = sext <2 x i32> %a to <2 x i64>
1791 %bext = sext <2 x i32> %b to <2 x i64>
1792 %abdiff = sub nsw <2 x i64> %aext, %bext
1793 %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
1794 %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
1795 %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
1796 ret <2 x i64> %absel
1799 define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) {
1800 ; CHECK-LABEL: uabd_i64:
1802 ; CHECK-NEXT: mov.d x8, v0[1]
1803 ; CHECK-NEXT: mov.d x9, v1[1]
1804 ; CHECK-NEXT: fmov x10, d0
1805 ; CHECK-NEXT: fmov x11, d1
1806 ; CHECK-NEXT: asr x12, x10, #63
1807 ; CHECK-NEXT: asr x13, x11, #63
1808 ; CHECK-NEXT: subs x10, x10, x11
1809 ; CHECK-NEXT: asr x11, x8, #63
1810 ; CHECK-NEXT: asr x14, x9, #63
1811 ; CHECK-NEXT: sbc x12, x12, x13
1812 ; CHECK-NEXT: subs x8, x8, x9
1813 ; CHECK-NEXT: sbc x9, x11, x14
1814 ; CHECK-NEXT: asr x13, x12, #63
1815 ; CHECK-NEXT: asr x11, x9, #63
1816 ; CHECK-NEXT: eor x10, x10, x13
1817 ; CHECK-NEXT: eor x8, x8, x11
1818 ; CHECK-NEXT: eor x9, x9, x11
1819 ; CHECK-NEXT: subs x2, x8, x11
1820 ; CHECK-NEXT: eor x8, x12, x13
1821 ; CHECK-NEXT: sbc x3, x9, x11
1822 ; CHECK-NEXT: subs x9, x10, x13
1823 ; CHECK-NEXT: fmov d0, x9
1824 ; CHECK-NEXT: sbc x1, x8, x13
1825 ; CHECK-NEXT: mov.d v0[1], x1
1826 ; CHECK-NEXT: fmov x0, d0
1828 %aext = sext <2 x i64> %a to <2 x i128>
1829 %bext = sext <2 x i64> %b to <2 x i128>
1830 %abdiff = sub nsw <2 x i128> %aext, %bext
1831 %abcmp = icmp slt <2 x i128> %abdiff, zeroinitializer
1832 %ababs = sub nsw <2 x i128> zeroinitializer, %abdiff
1833 %absel = select <2 x i1> %abcmp, <2 x i128> %ababs, <2 x i128> %abdiff
1834 ret <2 x i128> %absel
1837 define <8 x i16> @pr88784(<8 x i8> %l0, <8 x i8> %l1, <8 x i16> %l2) {
1838 ; CHECK-SD-LABEL: pr88784:
1839 ; CHECK-SD: // %bb.0:
1840 ; CHECK-SD-NEXT: usubl.8h v0, v0, v1
1841 ; CHECK-SD-NEXT: cmlt.8h v1, v2, #0
1842 ; CHECK-SD-NEXT: ssra.8h v0, v2, #15
1843 ; CHECK-SD-NEXT: eor.16b v0, v1, v0
1844 ; CHECK-SD-NEXT: ret
1846 ; CHECK-GI-LABEL: pr88784:
1847 ; CHECK-GI: // %bb.0:
1848 ; CHECK-GI-NEXT: usubl.8h v0, v0, v1
1849 ; CHECK-GI-NEXT: sshr.8h v1, v2, #15
1850 ; CHECK-GI-NEXT: ssra.8h v0, v2, #15
1851 ; CHECK-GI-NEXT: eor.16b v0, v1, v0
1852 ; CHECK-GI-NEXT: ret
1853 %l4 = zext <8 x i8> %l0 to <8 x i16>
1854 %l5 = ashr <8 x i16> %l2, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
1855 %l6 = zext <8 x i8> %l1 to <8 x i16>
1856 %l7 = sub <8 x i16> %l4, %l6
1857 %l8 = add <8 x i16> %l5, %l7
1858 %l9 = xor <8 x i16> %l5, %l8
1862 define <8 x i16> @pr88784_fixed(<8 x i8> %l0, <8 x i8> %l1, <8 x i16> %l2) {
1863 ; CHECK-SD-LABEL: pr88784_fixed:
1864 ; CHECK-SD: // %bb.0:
1865 ; CHECK-SD-NEXT: uabdl.8h v0, v0, v1
1866 ; CHECK-SD-NEXT: ret
1868 ; CHECK-GI-LABEL: pr88784_fixed:
1869 ; CHECK-GI: // %bb.0:
1870 ; CHECK-GI-NEXT: usubl.8h v0, v0, v1
1871 ; CHECK-GI-NEXT: sshr.8h v1, v0, #15
1872 ; CHECK-GI-NEXT: ssra.8h v0, v0, #15
1873 ; CHECK-GI-NEXT: eor.16b v0, v1, v0
1874 ; CHECK-GI-NEXT: ret
1875 %l4 = zext <8 x i8> %l0 to <8 x i16>
1876 %l6 = zext <8 x i8> %l1 to <8 x i16>
1877 %l7 = sub <8 x i16> %l4, %l6
1878 %l5 = ashr <8 x i16> %l7, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
1879 %l8 = add <8 x i16> %l5, %l7
1880 %l9 = xor <8 x i16> %l5, %l8