1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck -check-prefixes=CHECK,CHECK-SD %s
3 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
5 ; CHECK-GI: warning: Instruction selection used fallback path for uabd16b_rdx
6 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uabd4s_rdx
7 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sabd4s_rdx
8 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_8b
9 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_16b
10 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_4h
11 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_8h
12 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_2s
13 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_4s
14 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_1d
15 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for abs_1d_honestly
16 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fabds
17 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fabdd
18 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uabd_i64
20 define <8 x i16> @sabdl8h(ptr %A, ptr %B) nounwind {
21 ; CHECK-LABEL: sabdl8h:
23 ; CHECK-NEXT: ldr d0, [x0]
24 ; CHECK-NEXT: ldr d1, [x1]
25 ; CHECK-NEXT: sabdl.8h v0, v0, v1
27 %tmp1 = load <8 x i8>, ptr %A
28 %tmp2 = load <8 x i8>, ptr %B
29 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
30 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
34 define <4 x i32> @sabdl4s(ptr %A, ptr %B) nounwind {
35 ; CHECK-LABEL: sabdl4s:
37 ; CHECK-NEXT: ldr d0, [x0]
38 ; CHECK-NEXT: ldr d1, [x1]
39 ; CHECK-NEXT: sabdl.4s v0, v0, v1
41 %tmp1 = load <4 x i16>, ptr %A
42 %tmp2 = load <4 x i16>, ptr %B
43 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
44 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
48 define <2 x i64> @sabdl2d(ptr %A, ptr %B) nounwind {
49 ; CHECK-LABEL: sabdl2d:
51 ; CHECK-NEXT: ldr d0, [x0]
52 ; CHECK-NEXT: ldr d1, [x1]
53 ; CHECK-NEXT: sabdl.2d v0, v0, v1
55 %tmp1 = load <2 x i32>, ptr %A
56 %tmp2 = load <2 x i32>, ptr %B
57 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
58 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
62 define <8 x i16> @sabdl2_8h(ptr %A, ptr %B) nounwind {
63 ; CHECK-SD-LABEL: sabdl2_8h:
65 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
66 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
67 ; CHECK-SD-NEXT: sabdl.8h v0, v0, v1
70 ; CHECK-GI-LABEL: sabdl2_8h:
72 ; CHECK-GI-NEXT: ldr q0, [x0]
73 ; CHECK-GI-NEXT: ldr q1, [x1]
74 ; CHECK-GI-NEXT: sabdl2.8h v0, v0, v1
76 %load1 = load <16 x i8>, ptr %A
77 %load2 = load <16 x i8>, ptr %B
78 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
79 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
80 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
81 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
85 define <4 x i32> @sabdl2_4s(ptr %A, ptr %B) nounwind {
86 ; CHECK-SD-LABEL: sabdl2_4s:
88 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
89 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
90 ; CHECK-SD-NEXT: sabdl.4s v0, v0, v1
93 ; CHECK-GI-LABEL: sabdl2_4s:
95 ; CHECK-GI-NEXT: ldr q0, [x0]
96 ; CHECK-GI-NEXT: ldr q1, [x1]
97 ; CHECK-GI-NEXT: sabdl2.4s v0, v0, v1
99 %load1 = load <8 x i16>, ptr %A
100 %load2 = load <8 x i16>, ptr %B
101 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
102 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
103 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
104 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
108 define <2 x i64> @sabdl2_2d(ptr %A, ptr %B) nounwind {
109 ; CHECK-SD-LABEL: sabdl2_2d:
110 ; CHECK-SD: // %bb.0:
111 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
112 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
113 ; CHECK-SD-NEXT: sabdl.2d v0, v0, v1
116 ; CHECK-GI-LABEL: sabdl2_2d:
117 ; CHECK-GI: // %bb.0:
118 ; CHECK-GI-NEXT: ldr q0, [x0]
119 ; CHECK-GI-NEXT: ldr q1, [x1]
120 ; CHECK-GI-NEXT: sabdl2.2d v0, v0, v1
122 %load1 = load <4 x i32>, ptr %A
123 %load2 = load <4 x i32>, ptr %B
124 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
125 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
126 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
127 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
131 define <8 x i16> @uabdl8h(ptr %A, ptr %B) nounwind {
132 ; CHECK-LABEL: uabdl8h:
134 ; CHECK-NEXT: ldr d0, [x0]
135 ; CHECK-NEXT: ldr d1, [x1]
136 ; CHECK-NEXT: uabdl.8h v0, v0, v1
138 %tmp1 = load <8 x i8>, ptr %A
139 %tmp2 = load <8 x i8>, ptr %B
140 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
141 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
145 define <4 x i32> @uabdl4s(ptr %A, ptr %B) nounwind {
146 ; CHECK-LABEL: uabdl4s:
148 ; CHECK-NEXT: ldr d0, [x0]
149 ; CHECK-NEXT: ldr d1, [x1]
150 ; CHECK-NEXT: uabdl.4s v0, v0, v1
152 %tmp1 = load <4 x i16>, ptr %A
153 %tmp2 = load <4 x i16>, ptr %B
154 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
155 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
159 define <2 x i64> @uabdl2d(ptr %A, ptr %B) nounwind {
160 ; CHECK-LABEL: uabdl2d:
162 ; CHECK-NEXT: ldr d0, [x0]
163 ; CHECK-NEXT: ldr d1, [x1]
164 ; CHECK-NEXT: uabdl.2d v0, v0, v1
166 %tmp1 = load <2 x i32>, ptr %A
167 %tmp2 = load <2 x i32>, ptr %B
168 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
169 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
173 define <8 x i16> @uabdl2_8h(ptr %A, ptr %B) nounwind {
174 ; CHECK-SD-LABEL: uabdl2_8h:
175 ; CHECK-SD: // %bb.0:
176 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
177 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
178 ; CHECK-SD-NEXT: uabdl.8h v0, v0, v1
181 ; CHECK-GI-LABEL: uabdl2_8h:
182 ; CHECK-GI: // %bb.0:
183 ; CHECK-GI-NEXT: ldr q0, [x0]
184 ; CHECK-GI-NEXT: ldr q1, [x1]
185 ; CHECK-GI-NEXT: uabdl2.8h v0, v0, v1
187 %load1 = load <16 x i8>, ptr %A
188 %load2 = load <16 x i8>, ptr %B
189 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
190 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
192 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
193 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
197 define <4 x i32> @uabdl2_4s(ptr %A, ptr %B) nounwind {
198 ; CHECK-SD-LABEL: uabdl2_4s:
199 ; CHECK-SD: // %bb.0:
200 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
201 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
202 ; CHECK-SD-NEXT: uabdl.4s v0, v0, v1
205 ; CHECK-GI-LABEL: uabdl2_4s:
206 ; CHECK-GI: // %bb.0:
207 ; CHECK-GI-NEXT: ldr q0, [x0]
208 ; CHECK-GI-NEXT: ldr q1, [x1]
209 ; CHECK-GI-NEXT: uabdl2.4s v0, v0, v1
211 %load1 = load <8 x i16>, ptr %A
212 %load2 = load <8 x i16>, ptr %B
213 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
214 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
215 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
216 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
220 define <2 x i64> @uabdl2_2d(ptr %A, ptr %B) nounwind {
221 ; CHECK-SD-LABEL: uabdl2_2d:
222 ; CHECK-SD: // %bb.0:
223 ; CHECK-SD-NEXT: ldr d0, [x0, #8]
224 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
225 ; CHECK-SD-NEXT: uabdl.2d v0, v0, v1
228 ; CHECK-GI-LABEL: uabdl2_2d:
229 ; CHECK-GI: // %bb.0:
230 ; CHECK-GI-NEXT: ldr q0, [x0]
231 ; CHECK-GI-NEXT: ldr q1, [x1]
232 ; CHECK-GI-NEXT: uabdl2.2d v0, v0, v1
234 %load1 = load <4 x i32>, ptr %A
235 %load2 = load <4 x i32>, ptr %B
236 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
237 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
238 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
239 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
; Add-reduction intrinsics used by the v16 absolute-difference reduction tests
; (uabd16b_rdx reduces <16 x i16>; uabd16b_rdx_i32 / sabd16b_rdx_i32 reduce <16 x i32>).
243 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
244 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
246 define i16 @uabd16b_rdx(ptr %a, ptr %b) {
247 ; CHECK-LABEL: uabd16b_rdx:
249 ; CHECK-NEXT: ldr q0, [x0]
250 ; CHECK-NEXT: ldr q1, [x1]
251 ; CHECK-NEXT: uabd.16b v0, v0, v1
252 ; CHECK-NEXT: uaddlv.16b h0, v0
253 ; CHECK-NEXT: fmov w0, s0
255 %aload = load <16 x i8>, ptr %a, align 1
256 %bload = load <16 x i8>, ptr %b, align 1
257 %aext = zext <16 x i8> %aload to <16 x i16>
258 %bext = zext <16 x i8> %bload to <16 x i16>
259 %abdiff = sub nsw <16 x i16> %aext, %bext
260 %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
261 %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
262 %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
263 %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel)
267 define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
268 ; CHECK-SD-LABEL: uabd16b_rdx_i32:
269 ; CHECK-SD: // %bb.0:
270 ; CHECK-SD-NEXT: uabdl.8h v2, v0, v1
271 ; CHECK-SD-NEXT: uabal2.8h v2, v0, v1
272 ; CHECK-SD-NEXT: uaddlv.8h s0, v2
273 ; CHECK-SD-NEXT: fmov w0, s0
276 ; CHECK-GI-LABEL: uabd16b_rdx_i32:
277 ; CHECK-GI: // %bb.0:
278 ; CHECK-GI-NEXT: ushll.8h v3, v0, #0
279 ; CHECK-GI-NEXT: ushll.8h v4, v1, #0
280 ; CHECK-GI-NEXT: ushll2.8h v0, v0, #0
281 ; CHECK-GI-NEXT: ushll2.8h v1, v1, #0
282 ; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
283 ; CHECK-GI-NEXT: usubl.4s v5, v3, v4
284 ; CHECK-GI-NEXT: usubl2.4s v3, v3, v4
285 ; CHECK-GI-NEXT: usubl.4s v4, v0, v1
286 ; CHECK-GI-NEXT: usubl2.4s v0, v0, v1
287 ; CHECK-GI-NEXT: cmgt.4s v1, v2, v5
288 ; CHECK-GI-NEXT: cmgt.4s v6, v2, v3
289 ; CHECK-GI-NEXT: neg.4s v16, v5
290 ; CHECK-GI-NEXT: cmgt.4s v7, v2, v4
291 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
292 ; CHECK-GI-NEXT: neg.4s v17, v3
293 ; CHECK-GI-NEXT: neg.4s v18, v4
294 ; CHECK-GI-NEXT: neg.4s v19, v0
295 ; CHECK-GI-NEXT: shl.4s v1, v1, #31
296 ; CHECK-GI-NEXT: shl.4s v6, v6, #31
297 ; CHECK-GI-NEXT: shl.4s v7, v7, #31
298 ; CHECK-GI-NEXT: shl.4s v2, v2, #31
299 ; CHECK-GI-NEXT: sshr.4s v1, v1, #31
300 ; CHECK-GI-NEXT: sshr.4s v6, v6, #31
301 ; CHECK-GI-NEXT: sshr.4s v7, v7, #31
302 ; CHECK-GI-NEXT: sshr.4s v2, v2, #31
303 ; CHECK-GI-NEXT: bsl.16b v1, v16, v5
304 ; CHECK-GI-NEXT: bit.16b v3, v17, v6
305 ; CHECK-GI-NEXT: bit.16b v4, v18, v7
306 ; CHECK-GI-NEXT: bit.16b v0, v19, v2
307 ; CHECK-GI-NEXT: add.4s v1, v1, v3
308 ; CHECK-GI-NEXT: add.4s v0, v4, v0
309 ; CHECK-GI-NEXT: add.4s v0, v1, v0
310 ; CHECK-GI-NEXT: addv.4s s0, v0
311 ; CHECK-GI-NEXT: fmov w0, s0
313 %aext = zext <16 x i8> %a to <16 x i32>
314 %bext = zext <16 x i8> %b to <16 x i32>
315 %abdiff = sub nsw <16 x i32> %aext, %bext
316 %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
317 %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
318 %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
319 %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
323 define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
324 ; CHECK-SD-LABEL: sabd16b_rdx_i32:
325 ; CHECK-SD: // %bb.0:
326 ; CHECK-SD-NEXT: sabdl.8h v2, v0, v1
327 ; CHECK-SD-NEXT: sabal2.8h v2, v0, v1
328 ; CHECK-SD-NEXT: uaddlv.8h s0, v2
329 ; CHECK-SD-NEXT: fmov w0, s0
332 ; CHECK-GI-LABEL: sabd16b_rdx_i32:
333 ; CHECK-GI: // %bb.0:
334 ; CHECK-GI-NEXT: sshll.8h v3, v0, #0
335 ; CHECK-GI-NEXT: sshll.8h v4, v1, #0
336 ; CHECK-GI-NEXT: sshll2.8h v0, v0, #0
337 ; CHECK-GI-NEXT: sshll2.8h v1, v1, #0
338 ; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
339 ; CHECK-GI-NEXT: ssubl.4s v5, v3, v4
340 ; CHECK-GI-NEXT: ssubl2.4s v3, v3, v4
341 ; CHECK-GI-NEXT: ssubl.4s v4, v0, v1
342 ; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1
343 ; CHECK-GI-NEXT: cmgt.4s v1, v2, v5
344 ; CHECK-GI-NEXT: cmgt.4s v6, v2, v3
345 ; CHECK-GI-NEXT: neg.4s v16, v5
346 ; CHECK-GI-NEXT: cmgt.4s v7, v2, v4
347 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
348 ; CHECK-GI-NEXT: neg.4s v17, v3
349 ; CHECK-GI-NEXT: neg.4s v18, v4
350 ; CHECK-GI-NEXT: neg.4s v19, v0
351 ; CHECK-GI-NEXT: shl.4s v1, v1, #31
352 ; CHECK-GI-NEXT: shl.4s v6, v6, #31
353 ; CHECK-GI-NEXT: shl.4s v7, v7, #31
354 ; CHECK-GI-NEXT: shl.4s v2, v2, #31
355 ; CHECK-GI-NEXT: sshr.4s v1, v1, #31
356 ; CHECK-GI-NEXT: sshr.4s v6, v6, #31
357 ; CHECK-GI-NEXT: sshr.4s v7, v7, #31
358 ; CHECK-GI-NEXT: sshr.4s v2, v2, #31
359 ; CHECK-GI-NEXT: bsl.16b v1, v16, v5
360 ; CHECK-GI-NEXT: bit.16b v3, v17, v6
361 ; CHECK-GI-NEXT: bit.16b v4, v18, v7
362 ; CHECK-GI-NEXT: bit.16b v0, v19, v2
363 ; CHECK-GI-NEXT: add.4s v1, v1, v3
364 ; CHECK-GI-NEXT: add.4s v0, v4, v0
365 ; CHECK-GI-NEXT: add.4s v0, v1, v0
366 ; CHECK-GI-NEXT: addv.4s s0, v0
367 ; CHECK-GI-NEXT: fmov w0, s0
369 %aext = sext <16 x i8> %a to <16 x i32>
370 %bext = sext <16 x i8> %b to <16 x i32>
371 %abdiff = sub nsw <16 x i32> %aext, %bext
372 %abcmp = icmp slt <16 x i32> %abdiff, zeroinitializer
373 %ababs = sub nsw <16 x i32> zeroinitializer, %abdiff
374 %absel = select <16 x i1> %abcmp, <16 x i32> %ababs, <16 x i32> %abdiff
375 %reduced_v = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %absel)
; Add-reduction intrinsics used by the 8- and 4-lane i32 reduction tests
; (uabd8h_rdx / sabd8h_rdx use v8i32; uabdl4s_rdx_i32 uses v4i32).
380 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
381 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
383 define i32 @uabd8h_rdx(ptr %a, ptr %b) {
384 ; CHECK-SD-LABEL: uabd8h_rdx:
385 ; CHECK-SD: // %bb.0:
386 ; CHECK-SD-NEXT: ldr q0, [x0]
387 ; CHECK-SD-NEXT: ldr q1, [x1]
388 ; CHECK-SD-NEXT: uabd.8h v0, v0, v1
389 ; CHECK-SD-NEXT: uaddlv.8h s0, v0
390 ; CHECK-SD-NEXT: fmov w0, s0
393 ; CHECK-GI-LABEL: uabd8h_rdx:
394 ; CHECK-GI: // %bb.0:
395 ; CHECK-GI-NEXT: ldr q1, [x0]
396 ; CHECK-GI-NEXT: ldr q2, [x1]
397 ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000
398 ; CHECK-GI-NEXT: usubl.4s v3, v1, v2
399 ; CHECK-GI-NEXT: usubl2.4s v1, v1, v2
400 ; CHECK-GI-NEXT: cmgt.4s v2, v0, v3
401 ; CHECK-GI-NEXT: cmgt.4s v0, v0, v1
402 ; CHECK-GI-NEXT: neg.4s v4, v3
403 ; CHECK-GI-NEXT: neg.4s v5, v1
404 ; CHECK-GI-NEXT: shl.4s v2, v2, #31
405 ; CHECK-GI-NEXT: shl.4s v0, v0, #31
406 ; CHECK-GI-NEXT: sshr.4s v2, v2, #31
407 ; CHECK-GI-NEXT: sshr.4s v0, v0, #31
408 ; CHECK-GI-NEXT: bsl.16b v2, v4, v3
409 ; CHECK-GI-NEXT: bsl.16b v0, v5, v1
410 ; CHECK-GI-NEXT: add.4s v0, v2, v0
411 ; CHECK-GI-NEXT: addv.4s s0, v0
412 ; CHECK-GI-NEXT: fmov w0, s0
414 %aload = load <8 x i16>, ptr %a, align 1
415 %bload = load <8 x i16>, ptr %b, align 1
416 %aext = zext <8 x i16> %aload to <8 x i32>
417 %bext = zext <8 x i16> %bload to <8 x i32>
418 %abdiff = sub nsw <8 x i32> %aext, %bext
419 %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
420 %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
421 %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
422 %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
426 define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
427 ; CHECK-SD-LABEL: sabd8h_rdx:
428 ; CHECK-SD: // %bb.0:
429 ; CHECK-SD-NEXT: sabd.8h v0, v0, v1
430 ; CHECK-SD-NEXT: uaddlv.8h s0, v0
431 ; CHECK-SD-NEXT: fmov w0, s0
434 ; CHECK-GI-LABEL: sabd8h_rdx:
435 ; CHECK-GI: // %bb.0:
436 ; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
437 ; CHECK-GI-NEXT: ssubl.4s v3, v0, v1
438 ; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1
439 ; CHECK-GI-NEXT: cmgt.4s v1, v2, v3
440 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
441 ; CHECK-GI-NEXT: neg.4s v4, v3
442 ; CHECK-GI-NEXT: neg.4s v5, v0
443 ; CHECK-GI-NEXT: shl.4s v1, v1, #31
444 ; CHECK-GI-NEXT: shl.4s v2, v2, #31
445 ; CHECK-GI-NEXT: sshr.4s v1, v1, #31
446 ; CHECK-GI-NEXT: sshr.4s v2, v2, #31
447 ; CHECK-GI-NEXT: bsl.16b v1, v4, v3
448 ; CHECK-GI-NEXT: bit.16b v0, v5, v2
449 ; CHECK-GI-NEXT: add.4s v0, v1, v0
450 ; CHECK-GI-NEXT: addv.4s s0, v0
451 ; CHECK-GI-NEXT: fmov w0, s0
453 %aext = sext <8 x i16> %a to <8 x i32>
454 %bext = sext <8 x i16> %b to <8 x i32>
455 %abdiff = sub nsw <8 x i32> %aext, %bext
456 %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
457 %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
458 %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
459 %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel)
463 define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
464 ; CHECK-SD-LABEL: uabdl4s_rdx_i32:
465 ; CHECK-SD: // %bb.0:
466 ; CHECK-SD-NEXT: uabdl.4s v0, v0, v1
467 ; CHECK-SD-NEXT: addv.4s s0, v0
468 ; CHECK-SD-NEXT: fmov w0, s0
471 ; CHECK-GI-LABEL: uabdl4s_rdx_i32:
472 ; CHECK-GI: // %bb.0:
473 ; CHECK-GI-NEXT: usubl.4s v0, v0, v1
474 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
475 ; CHECK-GI-NEXT: neg.4s v2, v0
476 ; CHECK-GI-NEXT: cmgt.4s v1, v1, v0
477 ; CHECK-GI-NEXT: bit.16b v0, v2, v1
478 ; CHECK-GI-NEXT: addv.4s s0, v0
479 ; CHECK-GI-NEXT: fmov w0, s0
481 %aext = zext <4 x i16> %a to <4 x i32>
482 %bext = zext <4 x i16> %b to <4 x i32>
483 %abdiff = sub nsw <4 x i32> %aext, %bext
484 %abcmp = icmp slt <4 x i32> %abdiff, zeroinitializer
485 %ababs = sub nsw <4 x i32> zeroinitializer, %abdiff
486 %absel = select <4 x i1> %abcmp, <4 x i32> %ababs, <4 x i32> %abdiff
487 %reduced_v = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %absel)
; Add-reduction intrinsics used by the i64 reduction tests
; (uabd4s_rdx / sabd4s_rdx use v4i64; uabdl2d_rdx_i64 uses v2i64).
491 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
492 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
494 define i64 @uabd4s_rdx(ptr %a, ptr %b, i32 %h) {
495 ; CHECK-LABEL: uabd4s_rdx:
497 ; CHECK-NEXT: ldr q0, [x0]
498 ; CHECK-NEXT: ldr q1, [x1]
499 ; CHECK-NEXT: uabd.4s v0, v0, v1
500 ; CHECK-NEXT: uaddlv.4s d0, v0
501 ; CHECK-NEXT: fmov x0, d0
503 %aload = load <4 x i32>, ptr %a, align 1
504 %bload = load <4 x i32>, ptr %b, align 1
505 %aext = zext <4 x i32> %aload to <4 x i64>
506 %bext = zext <4 x i32> %bload to <4 x i64>
507 %abdiff = sub nsw <4 x i64> %aext, %bext
508 %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
509 %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
510 %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
511 %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
515 define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) {
516 ; CHECK-LABEL: sabd4s_rdx:
518 ; CHECK-NEXT: sabd.4s v0, v0, v1
519 ; CHECK-NEXT: uaddlv.4s d0, v0
520 ; CHECK-NEXT: fmov x0, d0
522 %aext = sext <4 x i32> %a to <4 x i64>
523 %bext = sext <4 x i32> %b to <4 x i64>
524 %abdiff = sub nsw <4 x i64> %aext, %bext
525 %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
526 %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
527 %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
528 %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel)
532 define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) {
533 ; CHECK-SD-LABEL: uabdl2d_rdx_i64:
534 ; CHECK-SD: // %bb.0:
535 ; CHECK-SD-NEXT: uabdl.2d v0, v0, v1
536 ; CHECK-SD-NEXT: addp.2d d0, v0
537 ; CHECK-SD-NEXT: fmov x0, d0
540 ; CHECK-GI-LABEL: uabdl2d_rdx_i64:
541 ; CHECK-GI: // %bb.0:
542 ; CHECK-GI-NEXT: usubl.2d v0, v0, v1
543 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
544 ; CHECK-GI-NEXT: neg.2d v2, v0
545 ; CHECK-GI-NEXT: cmgt.2d v1, v1, v0
546 ; CHECK-GI-NEXT: bit.16b v0, v2, v1
547 ; CHECK-GI-NEXT: addp.2d d0, v0
548 ; CHECK-GI-NEXT: fmov x0, d0
550 %aext = zext <2 x i32> %a to <2 x i64>
551 %bext = zext <2 x i32> %b to <2 x i64>
552 %abdiff = sub nsw <2 x i64> %aext, %bext
553 %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
554 %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
555 %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
556 %reduced_v = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %absel)
560 define <2 x float> @fabd_2s(ptr %A, ptr %B) nounwind {
561 ; CHECK-LABEL: fabd_2s:
563 ; CHECK-NEXT: ldr d0, [x0]
564 ; CHECK-NEXT: ldr d1, [x1]
565 ; CHECK-NEXT: fabd.2s v0, v0, v1
567 %tmp1 = load <2 x float>, ptr %A
568 %tmp2 = load <2 x float>, ptr %B
569 %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
570 ret <2 x float> %tmp3
573 define <4 x float> @fabd_4s(ptr %A, ptr %B) nounwind {
574 ; CHECK-LABEL: fabd_4s:
576 ; CHECK-NEXT: ldr q0, [x0]
577 ; CHECK-NEXT: ldr q1, [x1]
578 ; CHECK-NEXT: fabd.4s v0, v0, v1
580 %tmp1 = load <4 x float>, ptr %A
581 %tmp2 = load <4 x float>, ptr %B
582 %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
583 ret <4 x float> %tmp3
586 define <2 x double> @fabd_2d(ptr %A, ptr %B) nounwind {
587 ; CHECK-LABEL: fabd_2d:
589 ; CHECK-NEXT: ldr q0, [x0]
590 ; CHECK-NEXT: ldr q1, [x1]
591 ; CHECK-NEXT: fabd.2d v0, v0, v1
593 %tmp1 = load <2 x double>, ptr %A
594 %tmp2 = load <2 x double>, ptr %B
595 %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
596 ret <2 x double> %tmp3
; AArch64 NEON floating-point absolute-difference intrinsics exercised by the
; fabd_2s / fabd_4s / fabd_2d tests above.
599 declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
600 declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
601 declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
603 define <2 x float> @fabd_2s_from_fsub_fabs(ptr %A, ptr %B) nounwind {
604 ; CHECK-LABEL: fabd_2s_from_fsub_fabs:
606 ; CHECK-NEXT: ldr d0, [x0]
607 ; CHECK-NEXT: ldr d1, [x1]
608 ; CHECK-NEXT: fabd.2s v0, v0, v1
610 %tmp1 = load <2 x float>, ptr %A
611 %tmp2 = load <2 x float>, ptr %B
612 %sub = fsub <2 x float> %tmp1, %tmp2
613 %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
617 define <4 x float> @fabd_4s_from_fsub_fabs(ptr %A, ptr %B) nounwind {
618 ; CHECK-LABEL: fabd_4s_from_fsub_fabs:
620 ; CHECK-NEXT: ldr q0, [x0]
621 ; CHECK-NEXT: ldr q1, [x1]
622 ; CHECK-NEXT: fabd.4s v0, v0, v1
624 %tmp1 = load <4 x float>, ptr %A
625 %tmp2 = load <4 x float>, ptr %B
626 %sub = fsub <4 x float> %tmp1, %tmp2
627 %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
631 define <2 x double> @fabd_2d_from_fsub_fabs(ptr %A, ptr %B) nounwind {
632 ; CHECK-LABEL: fabd_2d_from_fsub_fabs:
634 ; CHECK-NEXT: ldr q0, [x0]
635 ; CHECK-NEXT: ldr q1, [x1]
636 ; CHECK-NEXT: fabd.2d v0, v0, v1
638 %tmp1 = load <2 x double>, ptr %A
639 %tmp2 = load <2 x double>, ptr %B
640 %sub = fsub <2 x double> %tmp1, %tmp2
641 %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
642 ret <2 x double> %abs
; Generic fabs intrinsics used by the fabd_*_from_fsub_fabs tests above, which
; check that fsub+fabs is matched to the fabd instruction.
645 declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
646 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
647 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
649 define <8 x i8> @sabd_8b(ptr %A, ptr %B) nounwind {
650 ; CHECK-LABEL: sabd_8b:
652 ; CHECK-NEXT: ldr d0, [x0]
653 ; CHECK-NEXT: ldr d1, [x1]
654 ; CHECK-NEXT: sabd.8b v0, v0, v1
656 %tmp1 = load <8 x i8>, ptr %A
657 %tmp2 = load <8 x i8>, ptr %B
658 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
662 define <16 x i8> @sabd_16b(ptr %A, ptr %B) nounwind {
663 ; CHECK-LABEL: sabd_16b:
665 ; CHECK-NEXT: ldr q0, [x0]
666 ; CHECK-NEXT: ldr q1, [x1]
667 ; CHECK-NEXT: sabd.16b v0, v0, v1
669 %tmp1 = load <16 x i8>, ptr %A
670 %tmp2 = load <16 x i8>, ptr %B
671 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
675 define <4 x i16> @sabd_4h(ptr %A, ptr %B) nounwind {
676 ; CHECK-LABEL: sabd_4h:
678 ; CHECK-NEXT: ldr d0, [x0]
679 ; CHECK-NEXT: ldr d1, [x1]
680 ; CHECK-NEXT: sabd.4h v0, v0, v1
682 %tmp1 = load <4 x i16>, ptr %A
683 %tmp2 = load <4 x i16>, ptr %B
684 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
688 define <8 x i16> @sabd_8h(ptr %A, ptr %B) nounwind {
689 ; CHECK-LABEL: sabd_8h:
691 ; CHECK-NEXT: ldr q0, [x0]
692 ; CHECK-NEXT: ldr q1, [x1]
693 ; CHECK-NEXT: sabd.8h v0, v0, v1
695 %tmp1 = load <8 x i16>, ptr %A
696 %tmp2 = load <8 x i16>, ptr %B
697 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
701 define <2 x i32> @sabd_2s(ptr %A, ptr %B) nounwind {
702 ; CHECK-LABEL: sabd_2s:
704 ; CHECK-NEXT: ldr d0, [x0]
705 ; CHECK-NEXT: ldr d1, [x1]
706 ; CHECK-NEXT: sabd.2s v0, v0, v1
708 %tmp1 = load <2 x i32>, ptr %A
709 %tmp2 = load <2 x i32>, ptr %B
710 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
714 define <4 x i32> @sabd_4s(ptr %A, ptr %B) nounwind {
715 ; CHECK-LABEL: sabd_4s:
717 ; CHECK-NEXT: ldr q0, [x0]
718 ; CHECK-NEXT: ldr q1, [x1]
719 ; CHECK-NEXT: sabd.4s v0, v0, v1
721 %tmp1 = load <4 x i32>, ptr %A
722 %tmp2 = load <4 x i32>, ptr %B
723 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
; AArch64 NEON signed absolute-difference intrinsics exercised by the sabd_*
; tests above and by the sabdl* widening tests earlier in the file.
727 declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
728 declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
729 declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
730 declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
731 declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
732 declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
734 define <8 x i8> @uabd_8b(ptr %A, ptr %B) nounwind {
735 ; CHECK-LABEL: uabd_8b:
737 ; CHECK-NEXT: ldr d0, [x0]
738 ; CHECK-NEXT: ldr d1, [x1]
739 ; CHECK-NEXT: uabd.8b v0, v0, v1
741 %tmp1 = load <8 x i8>, ptr %A
742 %tmp2 = load <8 x i8>, ptr %B
743 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
747 define <16 x i8> @uabd_16b(ptr %A, ptr %B) nounwind {
748 ; CHECK-LABEL: uabd_16b:
750 ; CHECK-NEXT: ldr q0, [x0]
751 ; CHECK-NEXT: ldr q1, [x1]
752 ; CHECK-NEXT: uabd.16b v0, v0, v1
754 %tmp1 = load <16 x i8>, ptr %A
755 %tmp2 = load <16 x i8>, ptr %B
756 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
760 define <4 x i16> @uabd_4h(ptr %A, ptr %B) nounwind {
761 ; CHECK-LABEL: uabd_4h:
763 ; CHECK-NEXT: ldr d0, [x0]
764 ; CHECK-NEXT: ldr d1, [x1]
765 ; CHECK-NEXT: uabd.4h v0, v0, v1
767 %tmp1 = load <4 x i16>, ptr %A
768 %tmp2 = load <4 x i16>, ptr %B
769 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
773 define <8 x i16> @uabd_8h(ptr %A, ptr %B) nounwind {
774 ; CHECK-LABEL: uabd_8h:
776 ; CHECK-NEXT: ldr q0, [x0]
777 ; CHECK-NEXT: ldr q1, [x1]
778 ; CHECK-NEXT: uabd.8h v0, v0, v1
780 %tmp1 = load <8 x i16>, ptr %A
781 %tmp2 = load <8 x i16>, ptr %B
782 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
786 define <2 x i32> @uabd_2s(ptr %A, ptr %B) nounwind {
787 ; CHECK-LABEL: uabd_2s:
789 ; CHECK-NEXT: ldr d0, [x0]
790 ; CHECK-NEXT: ldr d1, [x1]
791 ; CHECK-NEXT: uabd.2s v0, v0, v1
793 %tmp1 = load <2 x i32>, ptr %A
794 %tmp2 = load <2 x i32>, ptr %B
795 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
799 define <4 x i32> @uabd_4s(ptr %A, ptr %B) nounwind {
800 ; CHECK-LABEL: uabd_4s:
802 ; CHECK-NEXT: ldr q0, [x0]
803 ; CHECK-NEXT: ldr q1, [x1]
804 ; CHECK-NEXT: uabd.4s v0, v0, v1
806 %tmp1 = load <4 x i32>, ptr %A
807 %tmp2 = load <4 x i32>, ptr %B
808 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
; AArch64 NEON unsigned absolute-difference intrinsics exercised by the uabd_*
; tests above and by the uabdl* widening tests earlier in the file.
812 declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
813 declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
814 declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
815 declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
816 declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
817 declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
819 define <8 x i8> @sqabs_8b(ptr %A) nounwind {
820 ; CHECK-LABEL: sqabs_8b:
822 ; CHECK-NEXT: ldr d0, [x0]
823 ; CHECK-NEXT: sqabs.8b v0, v0
825 %tmp1 = load <8 x i8>, ptr %A
826 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
830 define <16 x i8> @sqabs_16b(ptr %A) nounwind {
831 ; CHECK-LABEL: sqabs_16b:
833 ; CHECK-NEXT: ldr q0, [x0]
834 ; CHECK-NEXT: sqabs.16b v0, v0
836 %tmp1 = load <16 x i8>, ptr %A
837 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
841 define <4 x i16> @sqabs_4h(ptr %A) nounwind {
842 ; CHECK-LABEL: sqabs_4h:
844 ; CHECK-NEXT: ldr d0, [x0]
845 ; CHECK-NEXT: sqabs.4h v0, v0
847 %tmp1 = load <4 x i16>, ptr %A
848 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
852 define <8 x i16> @sqabs_8h(ptr %A) nounwind {
853 ; CHECK-LABEL: sqabs_8h:
855 ; CHECK-NEXT: ldr q0, [x0]
856 ; CHECK-NEXT: sqabs.8h v0, v0
858 %tmp1 = load <8 x i16>, ptr %A
859 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
863 define <2 x i32> @sqabs_2s(ptr %A) nounwind {
864 ; CHECK-LABEL: sqabs_2s:
866 ; CHECK-NEXT: ldr d0, [x0]
867 ; CHECK-NEXT: sqabs.2s v0, v0
869 %tmp1 = load <2 x i32>, ptr %A
870 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
874 define <4 x i32> @sqabs_4s(ptr %A) nounwind {
875 ; CHECK-LABEL: sqabs_4s:
877 ; CHECK-NEXT: ldr q0, [x0]
878 ; CHECK-NEXT: sqabs.4s v0, v0
880 %tmp1 = load <4 x i32>, ptr %A
881 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
; AArch64 NEON saturating-absolute-value intrinsics exercised by the sqabs_*
; tests above.
885 declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
886 declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
887 declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
888 declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
889 declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
890 declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
892 define <8 x i8> @sqneg_8b(ptr %A) nounwind {
893 ; CHECK-LABEL: sqneg_8b:
895 ; CHECK-NEXT: ldr d0, [x0]
896 ; CHECK-NEXT: sqneg.8b v0, v0
898 %tmp1 = load <8 x i8>, ptr %A
899 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
903 define <16 x i8> @sqneg_16b(ptr %A) nounwind {
904 ; CHECK-LABEL: sqneg_16b:
906 ; CHECK-NEXT: ldr q0, [x0]
907 ; CHECK-NEXT: sqneg.16b v0, v0
909 %tmp1 = load <16 x i8>, ptr %A
910 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
914 define <4 x i16> @sqneg_4h(ptr %A) nounwind {
915 ; CHECK-LABEL: sqneg_4h:
917 ; CHECK-NEXT: ldr d0, [x0]
918 ; CHECK-NEXT: sqneg.4h v0, v0
920 %tmp1 = load <4 x i16>, ptr %A
921 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
925 define <8 x i16> @sqneg_8h(ptr %A) nounwind {
926 ; CHECK-LABEL: sqneg_8h:
928 ; CHECK-NEXT: ldr q0, [x0]
929 ; CHECK-NEXT: sqneg.8h v0, v0
931 %tmp1 = load <8 x i16>, ptr %A
932 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
936 define <2 x i32> @sqneg_2s(ptr %A) nounwind {
937 ; CHECK-LABEL: sqneg_2s:
939 ; CHECK-NEXT: ldr d0, [x0]
940 ; CHECK-NEXT: sqneg.2s v0, v0
942 %tmp1 = load <2 x i32>, ptr %A
943 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
947 define <4 x i32> @sqneg_4s(ptr %A) nounwind {
948 ; CHECK-LABEL: sqneg_4s:
950 ; CHECK-NEXT: ldr q0, [x0]
951 ; CHECK-NEXT: sqneg.4s v0, v0
953 %tmp1 = load <4 x i32>, ptr %A
954 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
; AArch64 NEON saturating-negate intrinsics exercised by the sqneg_* tests above.
958 declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
959 declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
960 declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
961 declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
962 declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
963 declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
; Integer absolute-value (llvm.aarch64.neon.abs.*) tests for every vector
; arrangement, plus the two scalar d-register forms (<1 x i64> and plain i64,
; the latter round-tripping through fmov). These currently fall back under
; GlobalISel (see the CHECK-GI warnings at the top of the file).
965 define <8 x i8> @abs_8b(ptr %A) nounwind {
966 ; CHECK-LABEL: abs_8b:
968 ; CHECK-NEXT: ldr d0, [x0]
969 ; CHECK-NEXT: abs.8b v0, v0
971 %tmp1 = load <8 x i8>, ptr %A
972 %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
976 define <16 x i8> @abs_16b(ptr %A) nounwind {
977 ; CHECK-LABEL: abs_16b:
979 ; CHECK-NEXT: ldr q0, [x0]
980 ; CHECK-NEXT: abs.16b v0, v0
982 %tmp1 = load <16 x i8>, ptr %A
983 %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
987 define <4 x i16> @abs_4h(ptr %A) nounwind {
988 ; CHECK-LABEL: abs_4h:
990 ; CHECK-NEXT: ldr d0, [x0]
991 ; CHECK-NEXT: abs.4h v0, v0
993 %tmp1 = load <4 x i16>, ptr %A
994 %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
998 define <8 x i16> @abs_8h(ptr %A) nounwind {
999 ; CHECK-LABEL: abs_8h:
1001 ; CHECK-NEXT: ldr q0, [x0]
1002 ; CHECK-NEXT: abs.8h v0, v0
1004 %tmp1 = load <8 x i16>, ptr %A
1005 %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
1009 define <2 x i32> @abs_2s(ptr %A) nounwind {
1010 ; CHECK-LABEL: abs_2s:
1012 ; CHECK-NEXT: ldr d0, [x0]
1013 ; CHECK-NEXT: abs.2s v0, v0
1015 %tmp1 = load <2 x i32>, ptr %A
1016 %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
1020 define <4 x i32> @abs_4s(ptr %A) nounwind {
1021 ; CHECK-LABEL: abs_4s:
1023 ; CHECK-NEXT: ldr q0, [x0]
1024 ; CHECK-NEXT: abs.4s v0, v0
1026 %tmp1 = load <4 x i32>, ptr %A
1027 %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
1031 define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
1032 ; CHECK-LABEL: abs_1d:
1034 ; CHECK-NEXT: abs d0, d0
1036 %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
1040 define i64 @abs_1d_honestly(i64 %A) nounwind {
1041 ; CHECK-LABEL: abs_1d_honestly:
1043 ; CHECK-NEXT: fmov d0, x0
1044 ; CHECK-NEXT: abs d0, d0
1045 ; CHECK-NEXT: fmov x0, d0
1047 %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
1051 declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
1052 declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
1053 declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
1054 declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
1055 declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
1056 declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
1057 declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
1058 declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
; Signed absolute-difference-and-accumulate-long (sabal) tests: sabd on the
; narrow elements, zext to the wide type, then add into the accumulator loaded
; from %C — the whole pattern should select a single sabal instruction.
1060 define <8 x i16> @sabal8h(ptr %A, ptr %B, ptr %C) nounwind {
1061 ; CHECK-LABEL: sabal8h:
1063 ; CHECK-NEXT: ldr d1, [x0]
1064 ; CHECK-NEXT: ldr d2, [x1]
1065 ; CHECK-NEXT: ldr q0, [x2]
1066 ; CHECK-NEXT: sabal.8h v0, v1, v2
1068 %tmp1 = load <8 x i8>, ptr %A
1069 %tmp2 = load <8 x i8>, ptr %B
1070 %tmp3 = load <8 x i16>, ptr %C
1071 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1072 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1073 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1077 define <4 x i32> @sabal4s(ptr %A, ptr %B, ptr %C) nounwind {
1078 ; CHECK-LABEL: sabal4s:
1080 ; CHECK-NEXT: ldr d1, [x0]
1081 ; CHECK-NEXT: ldr d2, [x1]
1082 ; CHECK-NEXT: ldr q0, [x2]
1083 ; CHECK-NEXT: sabal.4s v0, v1, v2
1085 %tmp1 = load <4 x i16>, ptr %A
1086 %tmp2 = load <4 x i16>, ptr %B
1087 %tmp3 = load <4 x i32>, ptr %C
1088 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1089 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1090 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
; sabal2d: same sabd + zext + accumulate pattern as sabal8h/sabal4s, at the
; <2 x i32> -> <2 x i64> width; expected to select sabal.2d.
; Fix: dropped a stray duplicate of the zext (%tmp4.1.1) — it had no uses,
; was inconsistent with the sibling tests, and did not affect codegen.
1094 define <2 x i64> @sabal2d(ptr %A, ptr %B, ptr %C) nounwind {
1095 ; CHECK-LABEL: sabal2d:
1097 ; CHECK-NEXT: ldr d1, [x0]
1098 ; CHECK-NEXT: ldr d2, [x1]
1099 ; CHECK-NEXT: ldr q0, [x2]
1100 ; CHECK-NEXT: sabal.2d v0, v1, v2
1102 %tmp1 = load <2 x i32>, ptr %A
1103 %tmp2 = load <2 x i32>, ptr %B
1104 %tmp3 = load <2 x i64>, ptr %C
1105 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1106 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1108 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
; sabal2 (high-half) tests: the inputs are the top halves of 128-bit vectors
; (via shufflevector). SelectionDAG folds the extract into an offset load and
; uses sabal on d-registers; GlobalISel keeps full q-register loads and
; selects the sabal2 form — hence the separate CHECK-SD / CHECK-GI bodies.
1112 define <8 x i16> @sabal2_8h(ptr %A, ptr %B, ptr %C) nounwind {
1113 ; CHECK-SD-LABEL: sabal2_8h:
1114 ; CHECK-SD: // %bb.0:
1115 ; CHECK-SD-NEXT: ldr q0, [x2]
1116 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1117 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1118 ; CHECK-SD-NEXT: sabal.8h v0, v1, v2
1119 ; CHECK-SD-NEXT: ret
1121 ; CHECK-GI-LABEL: sabal2_8h:
1122 ; CHECK-GI: // %bb.0:
1123 ; CHECK-GI-NEXT: ldr q1, [x0]
1124 ; CHECK-GI-NEXT: ldr q2, [x1]
1125 ; CHECK-GI-NEXT: ldr q0, [x2]
1126 ; CHECK-GI-NEXT: sabal2.8h v0, v1, v2
1127 ; CHECK-GI-NEXT: ret
1128 %load1 = load <16 x i8>, ptr %A
1129 %load2 = load <16 x i8>, ptr %B
1130 %tmp3 = load <8 x i16>, ptr %C
1131 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1132 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1133 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1134 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1135 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1139 define <4 x i32> @sabal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
1140 ; CHECK-SD-LABEL: sabal2_4s:
1141 ; CHECK-SD: // %bb.0:
1142 ; CHECK-SD-NEXT: ldr q0, [x2]
1143 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1144 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1145 ; CHECK-SD-NEXT: sabal.4s v0, v1, v2
1146 ; CHECK-SD-NEXT: ret
1148 ; CHECK-GI-LABEL: sabal2_4s:
1149 ; CHECK-GI: // %bb.0:
1150 ; CHECK-GI-NEXT: ldr q1, [x0]
1151 ; CHECK-GI-NEXT: ldr q2, [x1]
1152 ; CHECK-GI-NEXT: ldr q0, [x2]
1153 ; CHECK-GI-NEXT: sabal2.4s v0, v1, v2
1154 ; CHECK-GI-NEXT: ret
1155 %load1 = load <8 x i16>, ptr %A
1156 %load2 = load <8 x i16>, ptr %B
1157 %tmp3 = load <4 x i32>, ptr %C
1158 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1159 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1160 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1161 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1162 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1166 define <2 x i64> @sabal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
1167 ; CHECK-SD-LABEL: sabal2_2d:
1168 ; CHECK-SD: // %bb.0:
1169 ; CHECK-SD-NEXT: ldr q0, [x2]
1170 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1171 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1172 ; CHECK-SD-NEXT: sabal.2d v0, v1, v2
1173 ; CHECK-SD-NEXT: ret
1175 ; CHECK-GI-LABEL: sabal2_2d:
1176 ; CHECK-GI: // %bb.0:
1177 ; CHECK-GI-NEXT: ldr q1, [x0]
1178 ; CHECK-GI-NEXT: ldr q2, [x1]
1179 ; CHECK-GI-NEXT: ldr q0, [x2]
1180 ; CHECK-GI-NEXT: sabal2.2d v0, v1, v2
1181 ; CHECK-GI-NEXT: ret
1182 %load1 = load <4 x i32>, ptr %A
1183 %load2 = load <4 x i32>, ptr %B
1184 %tmp3 = load <2 x i64>, ptr %C
1185 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1186 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1187 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1188 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1189 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
; Unsigned absolute-difference-and-accumulate-long (uabal) tests: uabd on the
; narrow elements, zext to the wide type, add into the accumulator from %C —
; expected to select a single uabal instruction.
1193 define <8 x i16> @uabal8h(ptr %A, ptr %B, ptr %C) nounwind {
1194 ; CHECK-LABEL: uabal8h:
1196 ; CHECK-NEXT: ldr d1, [x0]
1197 ; CHECK-NEXT: ldr d2, [x1]
1198 ; CHECK-NEXT: ldr q0, [x2]
1199 ; CHECK-NEXT: uabal.8h v0, v1, v2
1201 %tmp1 = load <8 x i8>, ptr %A
1202 %tmp2 = load <8 x i8>, ptr %B
1203 %tmp3 = load <8 x i16>, ptr %C
1204 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1205 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1206 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1210 define <4 x i32> @uabal4s(ptr %A, ptr %B, ptr %C) nounwind {
1211 ; CHECK-LABEL: uabal4s:
1213 ; CHECK-NEXT: ldr d1, [x0]
1214 ; CHECK-NEXT: ldr d2, [x1]
1215 ; CHECK-NEXT: ldr q0, [x2]
1216 ; CHECK-NEXT: uabal.4s v0, v1, v2
1218 %tmp1 = load <4 x i16>, ptr %A
1219 %tmp2 = load <4 x i16>, ptr %B
1220 %tmp3 = load <4 x i32>, ptr %C
1221 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1222 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1223 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1227 define <2 x i64> @uabal2d(ptr %A, ptr %B, ptr %C) nounwind {
1228 ; CHECK-LABEL: uabal2d:
1230 ; CHECK-NEXT: ldr d1, [x0]
1231 ; CHECK-NEXT: ldr d2, [x1]
1232 ; CHECK-NEXT: ldr q0, [x2]
1233 ; CHECK-NEXT: uabal.2d v0, v1, v2
1235 %tmp1 = load <2 x i32>, ptr %A
1236 %tmp2 = load <2 x i32>, ptr %B
1237 %tmp3 = load <2 x i64>, ptr %C
1238 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1239 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1240 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
; uabal2 (high-half) tests: unsigned counterpart of the sabal2 tests above.
; SelectionDAG folds the high-half extract into offset d-register loads and
; uses uabal; GlobalISel loads full q-registers and selects uabal2.
1244 define <8 x i16> @uabal2_8h(ptr %A, ptr %B, ptr %C) nounwind {
1245 ; CHECK-SD-LABEL: uabal2_8h:
1246 ; CHECK-SD: // %bb.0:
1247 ; CHECK-SD-NEXT: ldr q0, [x2]
1248 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1249 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1250 ; CHECK-SD-NEXT: uabal.8h v0, v1, v2
1251 ; CHECK-SD-NEXT: ret
1253 ; CHECK-GI-LABEL: uabal2_8h:
1254 ; CHECK-GI: // %bb.0:
1255 ; CHECK-GI-NEXT: ldr q1, [x0]
1256 ; CHECK-GI-NEXT: ldr q2, [x1]
1257 ; CHECK-GI-NEXT: ldr q0, [x2]
1258 ; CHECK-GI-NEXT: uabal2.8h v0, v1, v2
1259 ; CHECK-GI-NEXT: ret
1260 %load1 = load <16 x i8>, ptr %A
1261 %load2 = load <16 x i8>, ptr %B
1262 %tmp3 = load <8 x i16>, ptr %C
1263 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1264 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1265 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1266 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
1267 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
1271 define <4 x i32> @uabal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
1272 ; CHECK-SD-LABEL: uabal2_4s:
1273 ; CHECK-SD: // %bb.0:
1274 ; CHECK-SD-NEXT: ldr q0, [x2]
1275 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1276 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1277 ; CHECK-SD-NEXT: uabal.4s v0, v1, v2
1278 ; CHECK-SD-NEXT: ret
1280 ; CHECK-GI-LABEL: uabal2_4s:
1281 ; CHECK-GI: // %bb.0:
1282 ; CHECK-GI-NEXT: ldr q1, [x0]
1283 ; CHECK-GI-NEXT: ldr q2, [x1]
1284 ; CHECK-GI-NEXT: ldr q0, [x2]
1285 ; CHECK-GI-NEXT: uabal2.4s v0, v1, v2
1286 ; CHECK-GI-NEXT: ret
1287 %load1 = load <8 x i16>, ptr %A
1288 %load2 = load <8 x i16>, ptr %B
1289 %tmp3 = load <4 x i32>, ptr %C
1290 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1291 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1292 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1293 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
1294 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
1298 define <2 x i64> @uabal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
1299 ; CHECK-SD-LABEL: uabal2_2d:
1300 ; CHECK-SD: // %bb.0:
1301 ; CHECK-SD-NEXT: ldr q0, [x2]
1302 ; CHECK-SD-NEXT: ldr d1, [x0, #8]
1303 ; CHECK-SD-NEXT: ldr d2, [x1, #8]
1304 ; CHECK-SD-NEXT: uabal.2d v0, v1, v2
1305 ; CHECK-SD-NEXT: ret
1307 ; CHECK-GI-LABEL: uabal2_2d:
1308 ; CHECK-GI: // %bb.0:
1309 ; CHECK-GI-NEXT: ldr q1, [x0]
1310 ; CHECK-GI-NEXT: ldr q2, [x1]
1311 ; CHECK-GI-NEXT: ldr q0, [x2]
1312 ; CHECK-GI-NEXT: uabal2.2d v0, v1, v2
1313 ; CHECK-GI-NEXT: ret
1314 %load1 = load <4 x i32>, ptr %A
1315 %load2 = load <4 x i32>, ptr %B
1316 %tmp3 = load <2 x i64>, ptr %C
1317 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1318 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1319 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1320 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
1321 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
; Signed absolute-difference-and-accumulate (saba) tests: sabd added to a
; same-width accumulator (no widening), expected to select saba.
1325 define <8 x i8> @saba_8b(ptr %A, ptr %B, ptr %C) nounwind {
1326 ; CHECK-LABEL: saba_8b:
1328 ; CHECK-NEXT: ldr d1, [x0]
1329 ; CHECK-NEXT: ldr d2, [x1]
1330 ; CHECK-NEXT: ldr d0, [x2]
1331 ; CHECK-NEXT: saba.8b v0, v1, v2
1333 %tmp1 = load <8 x i8>, ptr %A
1334 %tmp2 = load <8 x i8>, ptr %B
1335 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1336 %tmp4 = load <8 x i8>, ptr %C
1337 %tmp5 = add <8 x i8> %tmp3, %tmp4
1341 define <16 x i8> @saba_16b(ptr %A, ptr %B, ptr %C) nounwind {
1342 ; CHECK-LABEL: saba_16b:
1344 ; CHECK-NEXT: ldr q1, [x0]
1345 ; CHECK-NEXT: ldr q2, [x1]
1346 ; CHECK-NEXT: ldr q0, [x2]
1347 ; CHECK-NEXT: saba.16b v0, v1, v2
1349 %tmp1 = load <16 x i8>, ptr %A
1350 %tmp2 = load <16 x i8>, ptr %B
1351 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
1352 %tmp4 = load <16 x i8>, ptr %C
1353 %tmp5 = add <16 x i8> %tmp3, %tmp4
1357 define <4 x i16> @saba_4h(ptr %A, ptr %B, ptr %C) nounwind {
1358 ; CHECK-LABEL: saba_4h:
1360 ; CHECK-NEXT: ldr d1, [x0]
1361 ; CHECK-NEXT: ldr d2, [x1]
1362 ; CHECK-NEXT: ldr d0, [x2]
1363 ; CHECK-NEXT: saba.4h v0, v1, v2
1365 %tmp1 = load <4 x i16>, ptr %A
1366 %tmp2 = load <4 x i16>, ptr %B
1367 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1368 %tmp4 = load <4 x i16>, ptr %C
1369 %tmp5 = add <4 x i16> %tmp3, %tmp4
1373 define <8 x i16> @saba_8h(ptr %A, ptr %B, ptr %C) nounwind {
1374 ; CHECK-LABEL: saba_8h:
1376 ; CHECK-NEXT: ldr q1, [x0]
1377 ; CHECK-NEXT: ldr q2, [x1]
1378 ; CHECK-NEXT: ldr q0, [x2]
1379 ; CHECK-NEXT: saba.8h v0, v1, v2
1381 %tmp1 = load <8 x i16>, ptr %A
1382 %tmp2 = load <8 x i16>, ptr %B
1383 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
1384 %tmp4 = load <8 x i16>, ptr %C
1385 %tmp5 = add <8 x i16> %tmp3, %tmp4
1389 define <2 x i32> @saba_2s(ptr %A, ptr %B, ptr %C) nounwind {
1390 ; CHECK-LABEL: saba_2s:
1392 ; CHECK-NEXT: ldr d1, [x0]
1393 ; CHECK-NEXT: ldr d2, [x1]
1394 ; CHECK-NEXT: ldr d0, [x2]
1395 ; CHECK-NEXT: saba.2s v0, v1, v2
1397 %tmp1 = load <2 x i32>, ptr %A
1398 %tmp2 = load <2 x i32>, ptr %B
1399 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1400 %tmp4 = load <2 x i32>, ptr %C
1401 %tmp5 = add <2 x i32> %tmp3, %tmp4
1405 define <4 x i32> @saba_4s(ptr %A, ptr %B, ptr %C) nounwind {
1406 ; CHECK-LABEL: saba_4s:
1408 ; CHECK-NEXT: ldr q1, [x0]
1409 ; CHECK-NEXT: ldr q2, [x1]
1410 ; CHECK-NEXT: ldr q0, [x2]
1411 ; CHECK-NEXT: saba.4s v0, v1, v2
1413 %tmp1 = load <4 x i32>, ptr %A
1414 %tmp2 = load <4 x i32>, ptr %B
1415 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
1416 %tmp4 = load <4 x i32>, ptr %C
1417 %tmp5 = add <4 x i32> %tmp3, %tmp4
; Unsigned absolute-difference-and-accumulate (uaba) tests: uabd added to a
; same-width accumulator (no widening), expected to select uaba.
1421 define <8 x i8> @uaba_8b(ptr %A, ptr %B, ptr %C) nounwind {
1422 ; CHECK-LABEL: uaba_8b:
1424 ; CHECK-NEXT: ldr d1, [x0]
1425 ; CHECK-NEXT: ldr d2, [x1]
1426 ; CHECK-NEXT: ldr d0, [x2]
1427 ; CHECK-NEXT: uaba.8b v0, v1, v2
1429 %tmp1 = load <8 x i8>, ptr %A
1430 %tmp2 = load <8 x i8>, ptr %B
1431 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
1432 %tmp4 = load <8 x i8>, ptr %C
1433 %tmp5 = add <8 x i8> %tmp3, %tmp4
1437 define <16 x i8> @uaba_16b(ptr %A, ptr %B, ptr %C) nounwind {
1438 ; CHECK-LABEL: uaba_16b:
1440 ; CHECK-NEXT: ldr q1, [x0]
1441 ; CHECK-NEXT: ldr q2, [x1]
1442 ; CHECK-NEXT: ldr q0, [x2]
1443 ; CHECK-NEXT: uaba.16b v0, v1, v2
1445 %tmp1 = load <16 x i8>, ptr %A
1446 %tmp2 = load <16 x i8>, ptr %B
1447 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
1448 %tmp4 = load <16 x i8>, ptr %C
1449 %tmp5 = add <16 x i8> %tmp3, %tmp4
1453 define <4 x i16> @uaba_4h(ptr %A, ptr %B, ptr %C) nounwind {
1454 ; CHECK-LABEL: uaba_4h:
1456 ; CHECK-NEXT: ldr d1, [x0]
1457 ; CHECK-NEXT: ldr d2, [x1]
1458 ; CHECK-NEXT: ldr d0, [x2]
1459 ; CHECK-NEXT: uaba.4h v0, v1, v2
1461 %tmp1 = load <4 x i16>, ptr %A
1462 %tmp2 = load <4 x i16>, ptr %B
1463 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
1464 %tmp4 = load <4 x i16>, ptr %C
1465 %tmp5 = add <4 x i16> %tmp3, %tmp4
1469 define <8 x i16> @uaba_8h(ptr %A, ptr %B, ptr %C) nounwind {
1470 ; CHECK-LABEL: uaba_8h:
1472 ; CHECK-NEXT: ldr q1, [x0]
1473 ; CHECK-NEXT: ldr q2, [x1]
1474 ; CHECK-NEXT: ldr q0, [x2]
1475 ; CHECK-NEXT: uaba.8h v0, v1, v2
1477 %tmp1 = load <8 x i16>, ptr %A
1478 %tmp2 = load <8 x i16>, ptr %B
1479 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
1480 %tmp4 = load <8 x i16>, ptr %C
1481 %tmp5 = add <8 x i16> %tmp3, %tmp4
1485 define <2 x i32> @uaba_2s(ptr %A, ptr %B, ptr %C) nounwind {
1486 ; CHECK-LABEL: uaba_2s:
1488 ; CHECK-NEXT: ldr d1, [x0]
1489 ; CHECK-NEXT: ldr d2, [x1]
1490 ; CHECK-NEXT: ldr d0, [x2]
1491 ; CHECK-NEXT: uaba.2s v0, v1, v2
1493 %tmp1 = load <2 x i32>, ptr %A
1494 %tmp2 = load <2 x i32>, ptr %B
1495 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
1496 %tmp4 = load <2 x i32>, ptr %C
1497 %tmp5 = add <2 x i32> %tmp3, %tmp4
1501 define <4 x i32> @uaba_4s(ptr %A, ptr %B, ptr %C) nounwind {
1502 ; CHECK-LABEL: uaba_4s:
1504 ; CHECK-NEXT: ldr q1, [x0]
1505 ; CHECK-NEXT: ldr q2, [x1]
1506 ; CHECK-NEXT: ldr q0, [x2]
1507 ; CHECK-NEXT: uaba.4s v0, v1, v2
1509 %tmp1 = load <4 x i32>, ptr %A
1510 %tmp2 = load <4 x i32>, ptr %B
1511 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
1512 %tmp4 = load <4 x i32>, ptr %C
1513 %tmp5 = add <4 x i32> %tmp3, %tmp4
; Scalar floating-point absolute-difference via the llvm.aarch64.sisd.fabd
; intrinsics — expected to select a single fabd on s/d registers.
1518 define float @fabds(float %a, float %b) nounwind {
1519 ; CHECK-LABEL: fabds:
1521 ; CHECK-NEXT: fabd s0, s0, s1
1523 %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
1527 define double @fabdd(double %a, double %b) nounwind {
1528 ; CHECK-LABEL: fabdd:
1530 ; CHECK-NEXT: fabd d0, d0, d1
1532 %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
1536 declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
1537 declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
; Pattern-match tests: fabs(fsub(a, b)) should be combined into a single fabd
; without going through the intrinsic.
1539 define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
1540 ; CHECK-LABEL: fabds_from_fsub_fabs:
1542 ; CHECK-NEXT: fabd s0, s0, s1
1544 %sub = fsub float %a, %b
1545 %abs = tail call float @llvm.fabs.f32(float %sub)
1549 define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
1550 ; CHECK-LABEL: fabdd_from_fsub_fabs:
1552 ; CHECK-NEXT: fabd d0, d0, d1
1554 %sub = fsub double %a, %b
1555 %abs = tail call double @llvm.fabs.f64(double %sub)
1559 declare float @llvm.fabs.f32(float) nounwind readnone
1560 declare double @llvm.fabs.f64(double) nounwind readnone
; uabdl/sabdl with one operand splatted from a scalar (dup) and the other
; taken from the low or high half of a <4 x i32>. SelectionDAG folds the
; high-half case into the "2" (abdl2) instruction form; GlobalISel instead
; extracts the high half explicitly (mov d0, v0[1]) and uses the base form.
1562 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1563 ; CHECK-LABEL: uabdl_from_extract_dup:
1565 ; CHECK-NEXT: dup.2s v1, w0
1566 ; CHECK-NEXT: uabdl.2d v0, v0, v1
1568 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1569 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1570 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1571 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1572 %res1 = zext <2 x i32> %res to <2 x i64>
1576 define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1577 ; CHECK-SD-LABEL: uabdl2_from_extract_dup:
1578 ; CHECK-SD: // %bb.0:
1579 ; CHECK-SD-NEXT: dup.4s v1, w0
1580 ; CHECK-SD-NEXT: uabdl2.2d v0, v0, v1
1581 ; CHECK-SD-NEXT: ret
1583 ; CHECK-GI-LABEL: uabdl2_from_extract_dup:
1584 ; CHECK-GI: // %bb.0:
1585 ; CHECK-GI-NEXT: dup.2s v1, w0
1586 ; CHECK-GI-NEXT: mov d0, v0[1]
1587 ; CHECK-GI-NEXT: uabdl.2d v0, v0, v1
1588 ; CHECK-GI-NEXT: ret
1589 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1590 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1591 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1592 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1593 %res1 = zext <2 x i32> %res to <2 x i64>
1597 define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1598 ; CHECK-LABEL: sabdl_from_extract_dup:
1600 ; CHECK-NEXT: dup.2s v1, w0
1601 ; CHECK-NEXT: sabdl.2d v0, v0, v1
1603 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1604 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1605 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1606 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1607 %res1 = zext <2 x i32> %res to <2 x i64>
1611 define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
1612 ; CHECK-SD-LABEL: sabdl2_from_extract_dup:
1613 ; CHECK-SD: // %bb.0:
1614 ; CHECK-SD-NEXT: dup.4s v1, w0
1615 ; CHECK-SD-NEXT: sabdl2.2d v0, v0, v1
1616 ; CHECK-SD-NEXT: ret
1618 ; CHECK-GI-LABEL: sabdl2_from_extract_dup:
1619 ; CHECK-GI: // %bb.0:
1620 ; CHECK-GI-NEXT: dup.2s v1, w0
1621 ; CHECK-GI-NEXT: mov d0, v0[1]
1622 ; CHECK-GI-NEXT: sabdl.2d v0, v0, v1
1623 ; CHECK-GI-NEXT: ret
1624 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1625 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1626 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1627 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
1628 %res1 = zext <2 x i32> %res to <2 x i64>
; IR-level abs patterns (sub + icmp + select, covering sge/sgt/slt/sle
; variants and all element widths). SelectionDAG recognizes each as a single
; abs instruction; GlobalISel does not yet match them and emits the literal
; neg/compare/select sequence.
1632 define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
1633 ; CHECK-SD-LABEL: abspattern1:
1634 ; CHECK-SD: // %bb.0:
1635 ; CHECK-SD-NEXT: abs.2s v0, v0
1636 ; CHECK-SD-NEXT: ret
1638 ; CHECK-GI-LABEL: abspattern1:
1639 ; CHECK-GI: // %bb.0:
1640 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
1641 ; CHECK-GI-NEXT: neg.2s v2, v0
1642 ; CHECK-GI-NEXT: cmge.2s v1, v0, v1
1643 ; CHECK-GI-NEXT: bif.8b v0, v2, v1
1644 ; CHECK-GI-NEXT: ret
1645 %tmp1neg = sub <2 x i32> zeroinitializer, %a
1646 %b = icmp sge <2 x i32> %a, zeroinitializer
1647 %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
1651 ; For GlobalISel, this generates terrible code until we can pattern match this to abs.
1652 define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
1653 ; CHECK-SD-LABEL: abspattern2:
1654 ; CHECK-SD: // %bb.0:
1655 ; CHECK-SD-NEXT: abs.4h v0, v0
1656 ; CHECK-SD-NEXT: ret
1658 ; CHECK-GI-LABEL: abspattern2:
1659 ; CHECK-GI: // %bb.0:
1660 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
1661 ; CHECK-GI-NEXT: neg.4h v2, v0
1662 ; CHECK-GI-NEXT: cmgt.4h v1, v0, v1
1663 ; CHECK-GI-NEXT: bif.8b v0, v2, v1
1664 ; CHECK-GI-NEXT: ret
1665 %tmp1neg = sub <4 x i16> zeroinitializer, %a
1666 %b = icmp sgt <4 x i16> %a, zeroinitializer
1667 %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
1671 define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
1672 ; CHECK-SD-LABEL: abspattern3:
1673 ; CHECK-SD: // %bb.0:
1674 ; CHECK-SD-NEXT: abs.8b v0, v0
1675 ; CHECK-SD-NEXT: ret
1677 ; CHECK-GI-LABEL: abspattern3:
1678 ; CHECK-GI: // %bb.0:
1679 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
1680 ; CHECK-GI-NEXT: neg.8b v2, v0
1681 ; CHECK-GI-NEXT: cmgt.8b v1, v1, v0
1682 ; CHECK-GI-NEXT: bit.8b v0, v2, v1
1683 ; CHECK-GI-NEXT: ret
1684 %tmp1neg = sub <8 x i8> zeroinitializer, %a
1685 %b = icmp slt <8 x i8> %a, zeroinitializer
1686 %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
1690 define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
1691 ; CHECK-SD-LABEL: abspattern4:
1692 ; CHECK-SD: // %bb.0:
1693 ; CHECK-SD-NEXT: abs.4s v0, v0
1694 ; CHECK-SD-NEXT: ret
1696 ; CHECK-GI-LABEL: abspattern4:
1697 ; CHECK-GI: // %bb.0:
1698 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
1699 ; CHECK-GI-NEXT: neg.4s v2, v0
1700 ; CHECK-GI-NEXT: cmge.4s v1, v0, v1
1701 ; CHECK-GI-NEXT: bif.16b v0, v2, v1
1702 ; CHECK-GI-NEXT: ret
1703 %tmp1neg = sub <4 x i32> zeroinitializer, %a
1704 %b = icmp sge <4 x i32> %a, zeroinitializer
1705 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
1709 define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
1710 ; CHECK-SD-LABEL: abspattern5:
1711 ; CHECK-SD: // %bb.0:
1712 ; CHECK-SD-NEXT: abs.8h v0, v0
1713 ; CHECK-SD-NEXT: ret
1715 ; CHECK-GI-LABEL: abspattern5:
1716 ; CHECK-GI: // %bb.0:
1717 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
1718 ; CHECK-GI-NEXT: neg.8h v2, v0
1719 ; CHECK-GI-NEXT: cmgt.8h v1, v0, v1
1720 ; CHECK-GI-NEXT: bif.16b v0, v2, v1
1721 ; CHECK-GI-NEXT: ret
1722 %tmp1neg = sub <8 x i16> zeroinitializer, %a
1723 %b = icmp sgt <8 x i16> %a, zeroinitializer
1724 %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
1728 define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
1729 ; CHECK-SD-LABEL: abspattern6:
1730 ; CHECK-SD: // %bb.0:
1731 ; CHECK-SD-NEXT: abs.16b v0, v0
1732 ; CHECK-SD-NEXT: ret
1734 ; CHECK-GI-LABEL: abspattern6:
1735 ; CHECK-GI: // %bb.0:
1736 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
1737 ; CHECK-GI-NEXT: neg.16b v2, v0
1738 ; CHECK-GI-NEXT: cmgt.16b v1, v1, v0
1739 ; CHECK-GI-NEXT: bit.16b v0, v2, v1
1740 ; CHECK-GI-NEXT: ret
1741 %tmp1neg = sub <16 x i8> zeroinitializer, %a
1742 %b = icmp slt <16 x i8> %a, zeroinitializer
1743 %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
1747 define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
1748 ; CHECK-SD-LABEL: abspattern7:
1749 ; CHECK-SD: // %bb.0:
1750 ; CHECK-SD-NEXT: abs.2d v0, v0
1751 ; CHECK-SD-NEXT: ret
1753 ; CHECK-GI-LABEL: abspattern7:
1754 ; CHECK-GI: // %bb.0:
1755 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
1756 ; CHECK-GI-NEXT: neg.2d v2, v0
1757 ; CHECK-GI-NEXT: cmge.2d v1, v1, v0
1758 ; CHECK-GI-NEXT: bit.16b v0, v2, v1
1759 ; CHECK-GI-NEXT: ret
1760 %tmp1neg = sub <2 x i64> zeroinitializer, %a
1761 %b = icmp sle <2 x i64> %a, zeroinitializer
1762 %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
; Widening absolute-difference built from IR (sext, sub, select of negation).
; NOTE(review): despite the "uabd" name, this uses sext and SelectionDAG
; selects the signed sabdl.2d — the name looks historical; confirm intended.
; GlobalISel does not match the combine and emits ssubl + neg/cmp/select.
1766 define <2 x i64> @uabd_i32(<2 x i32> %a, <2 x i32> %b) {
1767 ; CHECK-SD-LABEL: uabd_i32:
1768 ; CHECK-SD: // %bb.0:
1769 ; CHECK-SD-NEXT: sabdl.2d v0, v0, v1
1770 ; CHECK-SD-NEXT: ret
1772 ; CHECK-GI-LABEL: uabd_i32:
1773 ; CHECK-GI: // %bb.0:
1774 ; CHECK-GI-NEXT: ssubl.2d v0, v0, v1
1775 ; CHECK-GI-NEXT: movi.2d v1, #0000000000000000
1776 ; CHECK-GI-NEXT: neg.2d v2, v0
1777 ; CHECK-GI-NEXT: cmgt.2d v1, v1, v0
1778 ; CHECK-GI-NEXT: bit.16b v0, v2, v1
1779 ; CHECK-GI-NEXT: ret
1780 %aext = sext <2 x i32> %a to <2 x i64>
1781 %bext = sext <2 x i32> %b to <2 x i64>
1782 %abdiff = sub nsw <2 x i64> %aext, %bext
1783 %abcmp = icmp slt <2 x i64> %abdiff, zeroinitializer
1784 %ababs = sub nsw <2 x i64> zeroinitializer, %abdiff
1785 %absel = select <2 x i1> %abcmp, <2 x i64> %ababs, <2 x i64> %abdiff
1786 ret <2 x i64> %absel
; Same abs-difference pattern widened to illegal <2 x i128>: no single
; instruction exists, so it is scalarized into GPR subs/sbc/eor sequences.
; Falls back under GlobalISel (see the CHECK-GI warning list at file top).
1789 define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) {
1790 ; CHECK-LABEL: uabd_i64:
1792 ; CHECK-NEXT: mov.d x8, v0[1]
1793 ; CHECK-NEXT: mov.d x9, v1[1]
1794 ; CHECK-NEXT: fmov x10, d0
1795 ; CHECK-NEXT: fmov x11, d1
1796 ; CHECK-NEXT: asr x12, x10, #63
1797 ; CHECK-NEXT: asr x13, x11, #63
1798 ; CHECK-NEXT: subs x10, x10, x11
1799 ; CHECK-NEXT: asr x11, x8, #63
1800 ; CHECK-NEXT: asr x14, x9, #63
1801 ; CHECK-NEXT: sbc x12, x12, x13
1802 ; CHECK-NEXT: subs x8, x8, x9
1803 ; CHECK-NEXT: sbc x9, x11, x14
1804 ; CHECK-NEXT: asr x13, x12, #63
1805 ; CHECK-NEXT: asr x11, x9, #63
1806 ; CHECK-NEXT: eor x10, x10, x13
1807 ; CHECK-NEXT: eor x8, x8, x11
1808 ; CHECK-NEXT: eor x9, x9, x11
1809 ; CHECK-NEXT: subs x2, x8, x11
1810 ; CHECK-NEXT: eor x8, x12, x13
1811 ; CHECK-NEXT: sbc x3, x9, x11
1812 ; CHECK-NEXT: subs x9, x10, x13
1813 ; CHECK-NEXT: fmov d0, x9
1814 ; CHECK-NEXT: sbc x1, x8, x13
1815 ; CHECK-NEXT: mov.d v0[1], x1
1816 ; CHECK-NEXT: fmov x0, d0
1818 %aext = sext <2 x i64> %a to <2 x i128>
1819 %bext = sext <2 x i64> %b to <2 x i128>
1820 %abdiff = sub nsw <2 x i128> %aext, %bext
1821 %abcmp = icmp slt <2 x i128> %abdiff, zeroinitializer
1822 %ababs = sub nsw <2 x i128> zeroinitializer, %abdiff
1823 %absel = select <2 x i1> %abcmp, <2 x i128> %ababs, <2 x i128> %abdiff
1824 ret <2 x i128> %absel