1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s -o - | FileCheck %s
; Signed absolute difference of two <16 x i8> vectors written in widened form:
; both inputs are sign-extended to i16, subtracted, negated, the non-negative
; one of the two is selected (icmp sge 0) and the result is truncated to i8.
; The expected output below is a single vabd.s8.
; NOTE(review): %add1/%add2 hold the results of `sub` instructions, not adds.
4 define arm_aapcs_vfpcc <16 x i8> @vabd_v16s8(<16 x i8> %src1, <16 x i8> %src2) {
5 ; CHECK-LABEL: vabd_v16s8:
7 ; CHECK-NEXT: vabd.s8 q0, q0, q1
9 %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
10 %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
11 %add1 = sub <16 x i16> %sextsrc1, %sextsrc2
12 %add2 = sub <16 x i16> zeroinitializer, %add1
13 %c = icmp sge <16 x i16> %add1, zeroinitializer
14 %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
15 %result = trunc <16 x i16> %s to <16 x i8>
; Signed absolute difference on <8 x i8> inputs (sext to i16, sub, negate,
; select on sge 0, trunc to i8).
; The expected output sign-extends both operands with vmovlb.s8 and then uses
; vabd.s16.
19 define arm_aapcs_vfpcc <8 x i8> @vabd_v8s8(<8 x i8> %src1, <8 x i8> %src2) {
20 ; CHECK-LABEL: vabd_v8s8:
22 ; CHECK-NEXT: vmovlb.s8 q1, q1
23 ; CHECK-NEXT: vmovlb.s8 q0, q0
24 ; CHECK-NEXT: vabd.s16 q0, q0, q1
26 %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
27 %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
28 %add1 = sub <8 x i16> %sextsrc1, %sextsrc2
29 %add2 = sub <8 x i16> zeroinitializer, %add1
30 %c = icmp sge <8 x i16> %add1, zeroinitializer
31 %s = select <8 x i1> %c, <8 x i16> %add1, <8 x i16> %add2
32 %result = trunc <8 x i16> %s to <8 x i8>
; Signed absolute difference on <4 x i8> inputs (sext to i16, sub, negate,
; select on sge 0, trunc to i8).
; The expected output does not contain a vabd: both operands are sign-extended
; twice (vmovlb.s8 then vmovlb.s16) and the result is formed with vsub.i32
; followed by vabs.s32.
36 define arm_aapcs_vfpcc <4 x i8> @vabd_v4s8(<4 x i8> %src1, <4 x i8> %src2) {
37 ; CHECK-LABEL: vabd_v4s8:
39 ; CHECK-NEXT: vmovlb.s8 q1, q1
40 ; CHECK-NEXT: vmovlb.s8 q0, q0
41 ; CHECK-NEXT: vmovlb.s16 q1, q1
42 ; CHECK-NEXT: vmovlb.s16 q0, q0
43 ; CHECK-NEXT: vsub.i32 q0, q0, q1
44 ; CHECK-NEXT: vabs.s32 q0, q0
46 %sextsrc1 = sext <4 x i8> %src1 to <4 x i16>
47 %sextsrc2 = sext <4 x i8> %src2 to <4 x i16>
48 %add1 = sub <4 x i16> %sextsrc1, %sextsrc2
49 %add2 = sub <4 x i16> zeroinitializer, %add1
50 %c = icmp sge <4 x i16> %add1, zeroinitializer
51 %s = select <4 x i1> %c, <4 x i16> %add1, <4 x i16> %add2
52 %result = trunc <4 x i16> %s to <4 x i8>
; Signed absolute difference of two <8 x i16> vectors, computed in i32
; (sext, sub, negate, select on sge 0) and truncated back to i16.
; The expected output below is a single vabd.s16.
56 define arm_aapcs_vfpcc <8 x i16> @vabd_v8s16(<8 x i16> %src1, <8 x i16> %src2) {
57 ; CHECK-LABEL: vabd_v8s16:
59 ; CHECK-NEXT: vabd.s16 q0, q0, q1
61 %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
62 %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
63 %add1 = sub <8 x i32> %sextsrc1, %sextsrc2
64 %add2 = sub <8 x i32> zeroinitializer, %add1
65 %c = icmp sge <8 x i32> %add1, zeroinitializer
66 %s = select <8 x i1> %c, <8 x i32> %add1, <8 x i32> %add2
67 %result = trunc <8 x i32> %s to <8 x i16>
; Signed absolute difference on <4 x i16> inputs, computed in i32
; (sext, sub, negate, select on sge 0) and truncated back to i16.
; The expected output sign-extends both operands with vmovlb.s16 and then uses
; vabd.s32.
71 define arm_aapcs_vfpcc <4 x i16> @vabd_v4s16(<4 x i16> %src1, <4 x i16> %src2) {
72 ; CHECK-LABEL: vabd_v4s16:
74 ; CHECK-NEXT: vmovlb.s16 q1, q1
75 ; CHECK-NEXT: vmovlb.s16 q0, q0
76 ; CHECK-NEXT: vabd.s32 q0, q0, q1
78 %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
79 %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
80 %add1 = sub <4 x i32> %sextsrc1, %sextsrc2
81 %add2 = sub <4 x i32> zeroinitializer, %add1
82 %c = icmp sge <4 x i32> %add1, zeroinitializer
83 %s = select <4 x i1> %c, <4 x i32> %add1, <4 x i32> %add2
84 %result = trunc <4 x i32> %s to <4 x i16>
; Signed absolute difference of two <4 x i32> vectors, computed in i64
; (sext, sub, negate, select on sge 0) and truncated back to i32.
; The expected output below is a single vabd.s32.
88 define arm_aapcs_vfpcc <4 x i32> @vabd_v4s32(<4 x i32> %src1, <4 x i32> %src2) {
89 ; CHECK-LABEL: vabd_v4s32:
91 ; CHECK-NEXT: vabd.s32 q0, q0, q1
93 %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
94 %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
95 %add1 = sub <4 x i64> %sextsrc1, %sextsrc2
96 %add2 = sub <4 x i64> zeroinitializer, %add1
97 %c = icmp sge <4 x i64> %add1, zeroinitializer
98 %s = select <4 x i1> %c, <4 x i64> %add1, <4 x i64> %add2
99 %result = trunc <4 x i64> %s to <4 x i32>
100 ret <4 x i32> %result
; Signed absolute difference on <2 x i32> inputs, computed in i64
; (sext, sub, negate, select on sge 0) and truncated back to i32.
; The expected output contains no vabd: each lane is moved to core registers,
; subtracted as a 64-bit value (subs/sbc with `asr #31` sign words), then
; combined with the sign mask via eor/subs/sbc before being moved back into q0.
103 define arm_aapcs_vfpcc <2 x i32> @vabd_v2s32(<2 x i32> %src1, <2 x i32> %src2) {
104 ; CHECK-LABEL: vabd_v2s32:
106 ; CHECK-NEXT: vmov r0, s2
107 ; CHECK-NEXT: vmov r2, s6
108 ; CHECK-NEXT: asrs r1, r0, #31
109 ; CHECK-NEXT: subs r0, r0, r2
110 ; CHECK-NEXT: sbc.w r1, r1, r2, asr #31
111 ; CHECK-NEXT: eor.w r0, r0, r1, asr #31
112 ; CHECK-NEXT: eor.w r2, r1, r1, asr #31
113 ; CHECK-NEXT: subs.w r0, r0, r1, asr #31
114 ; CHECK-NEXT: sbc.w r12, r2, r1, asr #31
115 ; CHECK-NEXT: vmov r2, s0
116 ; CHECK-NEXT: vmov r1, s4
117 ; CHECK-NEXT: asrs r3, r2, #31
118 ; CHECK-NEXT: subs r2, r2, r1
119 ; CHECK-NEXT: sbc.w r1, r3, r1, asr #31
120 ; CHECK-NEXT: eor.w r2, r2, r1, asr #31
121 ; CHECK-NEXT: subs.w r2, r2, r1, asr #31
122 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
123 ; CHECK-NEXT: eor.w r0, r1, r1, asr #31
124 ; CHECK-NEXT: sbc.w r0, r0, r1, asr #31
125 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
127 %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
128 %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
129 %add1 = sub <2 x i64> %sextsrc1, %sextsrc2
130 %add2 = sub <2 x i64> zeroinitializer, %add1
131 %c = icmp sge <2 x i64> %add1, zeroinitializer
132 %s = select <2 x i1> %c, <2 x i64> %add1, <2 x i64> %add2
133 %result = trunc <2 x i64> %s to <2 x i32>
134 ret <2 x i32> %result
; Unsigned absolute difference of two <16 x i8> vectors: both inputs are
; zero-extended to i16, subtracted, negated, the non-negative one is selected
; (signed icmp sge 0 on the widened difference) and truncated to i8.
; The expected output below is a single vabd.u8.
137 define arm_aapcs_vfpcc <16 x i8> @vabd_v16u8(<16 x i8> %src1, <16 x i8> %src2) {
138 ; CHECK-LABEL: vabd_v16u8:
140 ; CHECK-NEXT: vabd.u8 q0, q0, q1
142 %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
143 %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
144 %add1 = sub <16 x i16> %zextsrc1, %zextsrc2
145 %add2 = sub <16 x i16> zeroinitializer, %add1
146 %c = icmp sge <16 x i16> %add1, zeroinitializer
147 %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
148 %result = trunc <16 x i16> %s to <16 x i8>
149 ret <16 x i8> %result
; Unsigned absolute difference on <8 x i8> inputs (zext to i16, sub, negate,
; select on sge 0, trunc to i8).
; The expected output zero-extends both operands with vmovlb.u8 and then uses
; vabd.u16.
152 define arm_aapcs_vfpcc <8 x i8> @vabd_v8u8(<8 x i8> %src1, <8 x i8> %src2) {
153 ; CHECK-LABEL: vabd_v8u8:
155 ; CHECK-NEXT: vmovlb.u8 q1, q1
156 ; CHECK-NEXT: vmovlb.u8 q0, q0
157 ; CHECK-NEXT: vabd.u16 q0, q0, q1
159 %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
160 %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
161 %add1 = sub <8 x i16> %zextsrc1, %zextsrc2
162 %add2 = sub <8 x i16> zeroinitializer, %add1
163 %c = icmp sge <8 x i16> %add1, zeroinitializer
164 %s = select <8 x i1> %c, <8 x i16> %add1, <8 x i16> %add2
165 %result = trunc <8 x i16> %s to <8 x i8>
; Unsigned absolute difference on <4 x i8> inputs (zext to i16, sub, negate,
; select on sge 0, trunc to i8).
; The expected output does not contain a vabd: both operands are masked with
; #0xff (vmov.i32 + vand) and the result is formed with vsub.i32 followed by
; vabs.s32.
169 define arm_aapcs_vfpcc <4 x i8> @vabd_v4u8(<4 x i8> %src1, <4 x i8> %src2) {
170 ; CHECK-LABEL: vabd_v4u8:
172 ; CHECK-NEXT: vmov.i32 q2, #0xff
173 ; CHECK-NEXT: vand q1, q1, q2
174 ; CHECK-NEXT: vand q0, q0, q2
175 ; CHECK-NEXT: vsub.i32 q0, q0, q1
176 ; CHECK-NEXT: vabs.s32 q0, q0
178 %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
179 %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
180 %add1 = sub <4 x i16> %zextsrc1, %zextsrc2
181 %add2 = sub <4 x i16> zeroinitializer, %add1
182 %c = icmp sge <4 x i16> %add1, zeroinitializer
183 %s = select <4 x i1> %c, <4 x i16> %add1, <4 x i16> %add2
184 %result = trunc <4 x i16> %s to <4 x i8>
; Unsigned absolute difference of two <8 x i16> vectors, computed in i32
; (zext, sub, negate, select on sge 0) and truncated back to i16.
; The expected output below is a single vabd.u16.
188 define arm_aapcs_vfpcc <8 x i16> @vabd_v8u16(<8 x i16> %src1, <8 x i16> %src2) {
189 ; CHECK-LABEL: vabd_v8u16:
191 ; CHECK-NEXT: vabd.u16 q0, q0, q1
193 %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
194 %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
195 %add1 = sub <8 x i32> %zextsrc1, %zextsrc2
196 %add2 = sub <8 x i32> zeroinitializer, %add1
197 %c = icmp sge <8 x i32> %add1, zeroinitializer
198 %s = select <8 x i1> %c, <8 x i32> %add1, <8 x i32> %add2
199 %result = trunc <8 x i32> %s to <8 x i16>
200 ret <8 x i16> %result
; Unsigned absolute difference on <4 x i16> inputs, computed in i32
; (zext, sub, negate, select on sge 0) and truncated back to i16.
; The expected output zero-extends both operands with vmovlb.u16 and then uses
; vabd.u32.
203 define arm_aapcs_vfpcc <4 x i16> @vabd_v4u16(<4 x i16> %src1, <4 x i16> %src2) {
204 ; CHECK-LABEL: vabd_v4u16:
206 ; CHECK-NEXT: vmovlb.u16 q1, q1
207 ; CHECK-NEXT: vmovlb.u16 q0, q0
208 ; CHECK-NEXT: vabd.u32 q0, q0, q1
210 %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
211 %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
212 %add1 = sub <4 x i32> %zextsrc1, %zextsrc2
213 %add2 = sub <4 x i32> zeroinitializer, %add1
214 %c = icmp sge <4 x i32> %add1, zeroinitializer
215 %s = select <4 x i1> %c, <4 x i32> %add1, <4 x i32> %add2
216 %result = trunc <4 x i32> %s to <4 x i16>
217 ret <4 x i16> %result
; Unsigned absolute difference of two <4 x i32> vectors, computed in i64
; (zext, sub, negate, select on sge 0) and truncated back to i32.
; The expected output below is a single vabd.u32.
; NOTE(review): apart from its name, this function has the same signature,
; body and expected output as @vabd_v4u32 in this file, and its name does not
; follow the vabd_v<lanes><s|u><bits> scheme used by its neighbours - confirm
; whether the duplicate is intentional.
220 define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) {
221 ; CHECK-LABEL: vabd_u32:
223 ; CHECK-NEXT: vabd.u32 q0, q0, q1
225 %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
226 %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
227 %add1 = sub <4 x i64> %zextsrc1, %zextsrc2
228 %add2 = sub <4 x i64> zeroinitializer, %add1
229 %c = icmp sge <4 x i64> %add1, zeroinitializer
230 %s = select <4 x i1> %c, <4 x i64> %add1, <4 x i64> %add2
231 %result = trunc <4 x i64> %s to <4 x i32>
232 ret <4 x i32> %result
; Unsigned absolute difference of two <4 x i32> vectors, computed in i64
; (zext, sub, negate, select on sge 0) and truncated back to i32.
; The expected output below is a single vabd.u32.
235 define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32(<4 x i32> %src1, <4 x i32> %src2) {
236 ; CHECK-LABEL: vabd_v4u32:
238 ; CHECK-NEXT: vabd.u32 q0, q0, q1
240 %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
241 %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
242 %add1 = sub <4 x i64> %zextsrc1, %zextsrc2
243 %add2 = sub <4 x i64> zeroinitializer, %add1
244 %c = icmp sge <4 x i64> %add1, zeroinitializer
245 %s = select <4 x i1> %c, <4 x i64> %add1, <4 x i64> %add2
246 %result = trunc <4 x i64> %s to <4 x i32>
247 ret <4 x i32> %result
; Unsigned absolute difference on <2 x i32> inputs, computed in i64
; (zext, sub, negate, select on sge 0) and truncated back to i32.
; The expected output contains no vabd: both operands are masked with
; #0xffffffff (vmov.i64 + vand), each 64-bit lane is moved to core registers
; and subtracted with subs/sbc, then combined with the `asr #31` sign mask via
; eor/subs/sbc before being moved back into q0. r7 and lr are saved and
; restored around the sequence.
250 define arm_aapcs_vfpcc <2 x i32> @vabd_v2u32(<2 x i32> %src1, <2 x i32> %src2) {
251 ; CHECK-LABEL: vabd_v2u32:
253 ; CHECK-NEXT: .save {r7, lr}
254 ; CHECK-NEXT: push {r7, lr}
255 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff
256 ; CHECK-NEXT: vand q1, q1, q2
257 ; CHECK-NEXT: vand q0, q0, q2
258 ; CHECK-NEXT: vmov r0, r1, d3
259 ; CHECK-NEXT: vmov r2, r3, d1
260 ; CHECK-NEXT: subs r0, r2, r0
261 ; CHECK-NEXT: sbc.w r1, r3, r1
262 ; CHECK-NEXT: eor.w r0, r0, r1, asr #31
263 ; CHECK-NEXT: eor.w r2, r1, r1, asr #31
264 ; CHECK-NEXT: subs.w lr, r0, r1, asr #31
265 ; CHECK-NEXT: sbc.w r12, r2, r1, asr #31
266 ; CHECK-NEXT: vmov r2, r3, d2
267 ; CHECK-NEXT: vmov r1, r0, d0
268 ; CHECK-NEXT: subs r1, r1, r2
269 ; CHECK-NEXT: sbcs r0, r3
270 ; CHECK-NEXT: eor.w r1, r1, r0, asr #31
271 ; CHECK-NEXT: subs.w r1, r1, r0, asr #31
272 ; CHECK-NEXT: vmov q0[2], q0[0], r1, lr
273 ; CHECK-NEXT: eor.w r1, r0, r0, asr #31
274 ; CHECK-NEXT: sbc.w r0, r1, r0, asr #31
275 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
276 ; CHECK-NEXT: pop {r7, pc}
277 %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
278 %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
279 %add1 = sub <2 x i64> %zextsrc1, %zextsrc2
280 %add2 = sub <2 x i64> zeroinitializer, %add1
281 %c = icmp sge <2 x i64> %add1, zeroinitializer
282 %s = select <2 x i1> %c, <2 x i64> %add1, <2 x i64> %add2
283 %result = trunc <2 x i64> %s to <2 x i32>
284 ret <2 x i32> %result
; Loop form of the signed i8 absolute difference. Each iteration loads
; <16 x i8> from %x and %y at %index, sign-extends both to i32, computes
; sub / icmp slt 0 / negate / select, truncates to i8 and stores to %z.
; %index advances by 16 and the loop exits when it equals 1024.
; The expected loop body is vldrb.u8 x2, vabd.s8, vstrb.8, closed by `le lr`
; with lr preset to #64 (1024 / 16).
; The %n argument is not referenced in the visible body.
287 define void @vabd_loop_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
288 ; CHECK-LABEL: vabd_loop_s8:
289 ; CHECK: @ %bb.0: @ %entry
290 ; CHECK-NEXT: .save {r7, lr}
291 ; CHECK-NEXT: push {r7, lr}
292 ; CHECK-NEXT: mov.w lr, #64
293 ; CHECK-NEXT: .LBB15_1: @ %vector.body
294 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
295 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16
296 ; CHECK-NEXT: vldrb.u8 q1, [r0], #16
297 ; CHECK-NEXT: vabd.s8 q0, q1, q0
298 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
299 ; CHECK-NEXT: le lr, .LBB15_1
300 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
301 ; CHECK-NEXT: pop {r7, pc}
303 br label %vector.body
305 vector.body: ; preds = %vector.body, %entry
306 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
307 %0 = getelementptr inbounds i8, ptr %x, i32 %index
308 %wide.load = load <16 x i8>, ptr %0, align 1
309 %1 = sext <16 x i8> %wide.load to <16 x i32>
310 %2 = getelementptr inbounds i8, ptr %y, i32 %index
311 %wide.load22 = load <16 x i8>, ptr %2, align 1
312 %3 = sext <16 x i8> %wide.load22 to <16 x i32>
; %4 = x - y; %7 picks the negated difference when %4 is negative.
313 %4 = sub nsw <16 x i32> %1, %3
314 %5 = icmp slt <16 x i32> %4, zeroinitializer
315 %6 = sub nsw <16 x i32> zeroinitializer, %4
316 %7 = select <16 x i1> %5, <16 x i32> %6, <16 x i32> %4
317 %8 = trunc <16 x i32> %7 to <16 x i8>
318 %9 = getelementptr inbounds i8, ptr %z, i32 %index
319 store <16 x i8> %8, ptr %9, align 1
320 %index.next = add i32 %index, 16
321 %10 = icmp eq i32 %index.next, 1024
322 br i1 %10, label %for.cond.cleanup, label %vector.body
324 for.cond.cleanup: ; preds = %vector.body
; Loop form of the signed i16 absolute difference. Each iteration loads
; <8 x i16> from %x and %y at %index, sign-extends both to i32, computes
; sub / icmp slt 0 / negate / select, truncates to i16 and stores to %z.
; %index advances by 8 and the loop exits when it equals 1024.
; The expected loop body is vldrh.u16 x2, vabd.s16, vstrb.8, closed by `le lr`
; with lr preset to #128 (1024 / 8).
; The %n argument is not referenced in the visible body.
328 define void @vabd_loop_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
329 ; CHECK-LABEL: vabd_loop_s16:
330 ; CHECK: @ %bb.0: @ %entry
331 ; CHECK-NEXT: .save {r7, lr}
332 ; CHECK-NEXT: push {r7, lr}
333 ; CHECK-NEXT: mov.w lr, #128
334 ; CHECK-NEXT: .LBB16_1: @ %vector.body
335 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
336 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16
337 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16
338 ; CHECK-NEXT: vabd.s16 q0, q1, q0
339 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
340 ; CHECK-NEXT: le lr, .LBB16_1
341 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
342 ; CHECK-NEXT: pop {r7, pc}
344 br label %vector.body
346 vector.body: ; preds = %vector.body, %entry
347 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
348 %0 = getelementptr inbounds i16, ptr %x, i32 %index
349 %wide.load = load <8 x i16>, ptr %0, align 2
350 %1 = sext <8 x i16> %wide.load to <8 x i32>
351 %2 = getelementptr inbounds i16, ptr %y, i32 %index
352 %wide.load22 = load <8 x i16>, ptr %2, align 2
353 %3 = sext <8 x i16> %wide.load22 to <8 x i32>
; %4 = x - y; %7 picks the negated difference when %4 is negative.
354 %4 = sub nsw <8 x i32> %1, %3
355 %5 = icmp slt <8 x i32> %4, zeroinitializer
356 %6 = sub nsw <8 x i32> zeroinitializer, %4
357 %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
358 %8 = trunc <8 x i32> %7 to <8 x i16>
359 %9 = getelementptr inbounds i16, ptr %z, i32 %index
360 store <8 x i16> %8, ptr %9, align 2
361 %index.next = add i32 %index, 8
362 %10 = icmp eq i32 %index.next, 1024
363 br i1 %10, label %for.cond.cleanup, label %vector.body
365 for.cond.cleanup: ; preds = %vector.body
; Loop form of the signed i32 absolute difference. Each iteration loads
; <4 x i32> from %x and %y at %index and sign-extends both to i64. Unlike the
; s8/s16 loops in this file, the sign test (icmp slt 0) is done on the i64
; difference while the negate and select operate on the value already
; truncated to i32. %index advances by 4 and the loop exits at 1024.
; The expected output contains no vabd: the lanes are subtracted in core
; registers (subs/sbc with `asr #31` sign words), the four sign results are
; packed into r7 with bfi and written to p0 (vmsr), and a vsubt.i32 from the
; zero vector in q0 forms the result. lr is preset to #256 (1024 / 4).
; The %n argument is not referenced in the visible body.
369 define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
370 ; CHECK-LABEL: vabd_loop_s32:
371 ; CHECK: @ %bb.0: @ %entry
372 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
373 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
374 ; CHECK-NEXT: mov.w lr, #256
375 ; CHECK-NEXT: vmov.i32 q0, #0x0
376 ; CHECK-NEXT: .LBB17_1: @ %vector.body
377 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
378 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
379 ; CHECK-NEXT: vmov.f32 s8, s6
380 ; CHECK-NEXT: vmov r7, s4
381 ; CHECK-NEXT: vmov.f32 s6, s7
382 ; CHECK-NEXT: vmov r3, s8
383 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16
384 ; CHECK-NEXT: vmov.f32 s12, s10
385 ; CHECK-NEXT: vmov.f32 s10, s5
386 ; CHECK-NEXT: vmov.f32 s14, s11
387 ; CHECK-NEXT: vmov r4, s12
388 ; CHECK-NEXT: asr.w r12, r3, #31
389 ; CHECK-NEXT: subs.w r8, r3, r4
390 ; CHECK-NEXT: sbc.w r12, r12, r4, asr #31
391 ; CHECK-NEXT: vmov r4, s10
392 ; CHECK-NEXT: vmov.f32 s10, s9
393 ; CHECK-NEXT: vmov r6, s10
394 ; CHECK-NEXT: asrs r3, r4, #31
395 ; CHECK-NEXT: subs r4, r4, r6
396 ; CHECK-NEXT: sbc.w r9, r3, r6, asr #31
397 ; CHECK-NEXT: vmov r6, s8
398 ; CHECK-NEXT: vmov r3, s6
399 ; CHECK-NEXT: subs r5, r7, r6
400 ; CHECK-NEXT: asr.w r7, r7, #31
401 ; CHECK-NEXT: vmov q2[2], q2[0], r5, r8
402 ; CHECK-NEXT: vmov r5, s14
403 ; CHECK-NEXT: sbc.w r6, r7, r6, asr #31
404 ; CHECK-NEXT: asrs r6, r6, #31
405 ; CHECK-NEXT: subs r7, r3, r5
406 ; CHECK-NEXT: asr.w r3, r3, #31
407 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r7
408 ; CHECK-NEXT: mov.w r7, #0
409 ; CHECK-NEXT: sbc.w r3, r3, r5, asr #31
410 ; CHECK-NEXT: bfi r7, r6, #0, #4
411 ; CHECK-NEXT: asr.w r4, r9, #31
412 ; CHECK-NEXT: asr.w r6, r12, #31
413 ; CHECK-NEXT: bfi r7, r4, #4, #4
414 ; CHECK-NEXT: asrs r3, r3, #31
415 ; CHECK-NEXT: bfi r7, r6, #8, #4
416 ; CHECK-NEXT: bfi r7, r3, #12, #4
417 ; CHECK-NEXT: vmsr p0, r7
419 ; CHECK-NEXT: vsubt.i32 q2, q0, q2
420 ; CHECK-NEXT: vstrb.8 q2, [r2], #16
421 ; CHECK-NEXT: le lr, .LBB17_1
422 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
423 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
425 br label %vector.body
427 vector.body: ; preds = %vector.body, %entry
428 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
429 %0 = getelementptr inbounds i32, ptr %x, i32 %index
430 %wide.load = load <4 x i32>, ptr %0, align 4
431 %1 = sext <4 x i32> %wide.load to <4 x i64>
432 %2 = getelementptr inbounds i32, ptr %y, i32 %index
433 %wide.load23 = load <4 x i32>, ptr %2, align 4
434 %3 = sext <4 x i32> %wide.load23 to <4 x i64>
; The sign test uses the i64 difference; negate/select use the i32 truncation.
435 %4 = sub nsw <4 x i64> %1, %3
436 %5 = icmp slt <4 x i64> %4, zeroinitializer
437 %6 = trunc <4 x i64> %4 to <4 x i32>
438 %7 = sub <4 x i32> zeroinitializer, %6
439 %8 = select <4 x i1> %5, <4 x i32> %7, <4 x i32> %6
440 %9 = getelementptr inbounds i32, ptr %z, i32 %index
441 store <4 x i32> %8, ptr %9, align 4
442 %index.next = add i32 %index, 4
443 %10 = icmp eq i32 %index.next, 1024
444 br i1 %10, label %for.cond.cleanup, label %vector.body
446 for.cond.cleanup: ; preds = %vector.body
; Loop form of the unsigned i8 absolute difference. Each iteration loads
; <16 x i8> from %x and %y at %index, zero-extends both to i32, computes
; sub / icmp slt 0 / negate / select, truncates to i8 and stores to %z.
; %index advances by 16 and the loop exits when it equals 1024.
; The expected loop body is vldrb.u8 x2, vabd.u8, vstrb.8, closed by `le lr`
; with lr preset to #64 (1024 / 16).
; The %n argument is not referenced in the visible body.
450 define void @vabd_loop_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
451 ; CHECK-LABEL: vabd_loop_u8:
452 ; CHECK: @ %bb.0: @ %entry
453 ; CHECK-NEXT: .save {r7, lr}
454 ; CHECK-NEXT: push {r7, lr}
455 ; CHECK-NEXT: mov.w lr, #64
456 ; CHECK-NEXT: .LBB18_1: @ %vector.body
457 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
458 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16
459 ; CHECK-NEXT: vldrb.u8 q1, [r0], #16
460 ; CHECK-NEXT: vabd.u8 q0, q1, q0
461 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
462 ; CHECK-NEXT: le lr, .LBB18_1
463 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
464 ; CHECK-NEXT: pop {r7, pc}
466 br label %vector.body
468 vector.body: ; preds = %vector.body, %entry
469 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
470 %0 = getelementptr inbounds i8, ptr %x, i32 %index
471 %wide.load = load <16 x i8>, ptr %0, align 1
472 %1 = zext <16 x i8> %wide.load to <16 x i32>
473 %2 = getelementptr inbounds i8, ptr %y, i32 %index
474 %wide.load22 = load <16 x i8>, ptr %2, align 1
475 %3 = zext <16 x i8> %wide.load22 to <16 x i32>
; %4 = x - y; %7 picks the negated difference when %4 is negative.
476 %4 = sub nsw <16 x i32> %1, %3
477 %5 = icmp slt <16 x i32> %4, zeroinitializer
478 %6 = sub nsw <16 x i32> zeroinitializer, %4
479 %7 = select <16 x i1> %5, <16 x i32> %6, <16 x i32> %4
480 %8 = trunc <16 x i32> %7 to <16 x i8>
481 %9 = getelementptr inbounds i8, ptr %z, i32 %index
482 store <16 x i8> %8, ptr %9, align 1
483 %index.next = add i32 %index, 16
484 %10 = icmp eq i32 %index.next, 1024
485 br i1 %10, label %for.cond.cleanup, label %vector.body
487 for.cond.cleanup: ; preds = %vector.body
; Loop form of the unsigned i16 absolute difference. Each iteration loads
; <8 x i16> from %x and %y at %index, zero-extends both to i32, computes
; sub / icmp slt 0 / negate / select, truncates to i16 and stores to %z.
; %index advances by 8 and the loop exits when it equals 1024.
; The expected loop body is vldrh.u16 x2, vabd.u16, vstrb.8, closed by `le lr`
; with lr preset to #128 (1024 / 8).
; The %n argument is not referenced in the visible body.
491 define void @vabd_loop_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
492 ; CHECK-LABEL: vabd_loop_u16:
493 ; CHECK: @ %bb.0: @ %entry
494 ; CHECK-NEXT: .save {r7, lr}
495 ; CHECK-NEXT: push {r7, lr}
496 ; CHECK-NEXT: mov.w lr, #128
497 ; CHECK-NEXT: .LBB19_1: @ %vector.body
498 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
499 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16
500 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16
501 ; CHECK-NEXT: vabd.u16 q0, q1, q0
502 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
503 ; CHECK-NEXT: le lr, .LBB19_1
504 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
505 ; CHECK-NEXT: pop {r7, pc}
507 br label %vector.body
509 vector.body: ; preds = %vector.body, %entry
510 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
511 %0 = getelementptr inbounds i16, ptr %x, i32 %index
512 %wide.load = load <8 x i16>, ptr %0, align 2
513 %1 = zext <8 x i16> %wide.load to <8 x i32>
514 %2 = getelementptr inbounds i16, ptr %y, i32 %index
515 %wide.load22 = load <8 x i16>, ptr %2, align 2
516 %3 = zext <8 x i16> %wide.load22 to <8 x i32>
; %4 = x - y; %7 picks the negated difference when %4 is negative.
517 %4 = sub nsw <8 x i32> %1, %3
518 %5 = icmp slt <8 x i32> %4, zeroinitializer
519 %6 = sub nsw <8 x i32> zeroinitializer, %4
520 %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
521 %8 = trunc <8 x i32> %7 to <8 x i16>
522 %9 = getelementptr inbounds i16, ptr %z, i32 %index
523 store <8 x i16> %8, ptr %9, align 2
524 %index.next = add i32 %index, 8
525 %10 = icmp eq i32 %index.next, 1024
526 br i1 %10, label %for.cond.cleanup, label %vector.body
528 for.cond.cleanup: ; preds = %vector.body
; Loop form of the unsigned i32 absolute difference. Each iteration loads
; <4 x i32> from %x and %y at %index and zero-extends both to i64. Unlike the
; u8/u16 loops in this file, the sign test (icmp slt 0) is done on the i64
; difference while the negate and select operate on the value already
; truncated to i32. %index advances by 4 and the loop exits at 1024.
; The expected output contains no vabd: lanes are masked with #0xffffffff
; (vand with q0), subtracted in core registers with subs/sbc, the four sign
; results are packed into r4 with bfi and written to p0 (vmsr), and a
; vsubt.i32 from the zero vector in q1 forms the result. lr is preset to #256
; (1024 / 4) and d8-d11 are saved and restored around the loop.
; The %n argument is not referenced in the visible body.
532 define void @vabd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
533 ; CHECK-LABEL: vabd_loop_u32:
534 ; CHECK: @ %bb.0: @ %entry
535 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
536 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
537 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
538 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
539 ; CHECK-NEXT: mov.w lr, #256
540 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff
541 ; CHECK-NEXT: vmov.i32 q1, #0x0
542 ; CHECK-NEXT: .LBB20_1: @ %vector.body
543 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
544 ; CHECK-NEXT: vldrw.u32 q4, [r1], #16
545 ; CHECK-NEXT: vldrw.u32 q5, [r0], #16
546 ; CHECK-NEXT: vmov.f32 s8, s18
547 ; CHECK-NEXT: vmov.f32 s10, s19
548 ; CHECK-NEXT: vmov.f32 s12, s22
549 ; CHECK-NEXT: vand q2, q2, q0
550 ; CHECK-NEXT: vmov.f32 s14, s23
551 ; CHECK-NEXT: vand q3, q3, q0
552 ; CHECK-NEXT: vmov r3, r12, d4
553 ; CHECK-NEXT: vmov r4, r5, d6
554 ; CHECK-NEXT: vmov.f32 s18, s17
555 ; CHECK-NEXT: vmov.f32 s22, s21
556 ; CHECK-NEXT: vand q4, q4, q0
557 ; CHECK-NEXT: vand q5, q5, q0
558 ; CHECK-NEXT: vmov r6, r7, d11
559 ; CHECK-NEXT: subs.w r8, r4, r3
560 ; CHECK-NEXT: sbc.w r12, r5, r12
561 ; CHECK-NEXT: vmov r5, r3, d9
562 ; CHECK-NEXT: subs.w r10, r6, r5
563 ; CHECK-NEXT: sbc.w r9, r7, r3
564 ; CHECK-NEXT: vmov r6, r7, d8
565 ; CHECK-NEXT: vmov r4, r3, d10
566 ; CHECK-NEXT: subs r4, r4, r6
567 ; CHECK-NEXT: sbcs r3, r7
568 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r8
569 ; CHECK-NEXT: vmov r4, r6, d5
570 ; CHECK-NEXT: vmov r7, r5, d7
571 ; CHECK-NEXT: asrs r3, r3, #31
572 ; CHECK-NEXT: subs r4, r7, r4
573 ; CHECK-NEXT: vmov q4[3], q4[1], r10, r4
574 ; CHECK-NEXT: mov.w r4, #0
575 ; CHECK-NEXT: bfi r4, r3, #0, #4
576 ; CHECK-NEXT: asr.w r3, r9, #31
577 ; CHECK-NEXT: bfi r4, r3, #4, #4
578 ; CHECK-NEXT: asr.w r3, r12, #31
579 ; CHECK-NEXT: bfi r4, r3, #8, #4
580 ; CHECK-NEXT: sbc.w r3, r5, r6
581 ; CHECK-NEXT: asrs r3, r3, #31
582 ; CHECK-NEXT: bfi r4, r3, #12, #4
583 ; CHECK-NEXT: vmsr p0, r4
585 ; CHECK-NEXT: vsubt.i32 q4, q1, q4
586 ; CHECK-NEXT: vstrb.8 q4, [r2], #16
587 ; CHECK-NEXT: le lr, .LBB20_1
588 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
589 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
590 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
592 br label %vector.body
594 vector.body: ; preds = %vector.body, %entry
595 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
596 %0 = getelementptr inbounds i32, ptr %x, i32 %index
597 %wide.load = load <4 x i32>, ptr %0, align 4
598 %1 = zext <4 x i32> %wide.load to <4 x i64>
599 %2 = getelementptr inbounds i32, ptr %y, i32 %index
600 %wide.load23 = load <4 x i32>, ptr %2, align 4
601 %3 = zext <4 x i32> %wide.load23 to <4 x i64>
; The sign test uses the i64 difference; negate/select use the i32 truncation.
602 %4 = sub nsw <4 x i64> %1, %3
603 %5 = icmp slt <4 x i64> %4, zeroinitializer
604 %6 = trunc <4 x i64> %4 to <4 x i32>
605 %7 = sub <4 x i32> zeroinitializer, %6
606 %8 = select <4 x i1> %5, <4 x i32> %7, <4 x i32> %6
607 %9 = getelementptr inbounds i32, ptr %z, i32 %index
608 store <4 x i32> %8, ptr %9, align 4
609 %index.next = add i32 %index, 4
610 %10 = icmp eq i32 %index.next, 1024
611 br i1 %10, label %for.cond.cleanup, label %vector.body
613 for.cond.cleanup: ; preds = %vector.body
; Builds the unsigned absolute-difference pattern twice with the operands
; swapped (%aresult from %src1 - %src2, %bresult from %src2 - %src1) and adds
; the two results.
; The expected output has only one vabd.u32 followed by vadd.i32 q0, q0, q0,
; i.e. both patterns are covered by the same vabd result.
617 define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32_commutative(<4 x i32> %src1, <4 x i32> %src2) {
618 ; CHECK-LABEL: vabd_v4u32_commutative:
620 ; CHECK-NEXT: vabd.u32 q0, q1, q0
621 ; CHECK-NEXT: vadd.i32 q0, q0, q0
; First pattern: |src1 - src2|.
623 %azextsrc1 = zext <4 x i32> %src1 to <4 x i64>
624 %azextsrc2 = zext <4 x i32> %src2 to <4 x i64>
625 %aadd1 = sub <4 x i64> %azextsrc1, %azextsrc2
626 %aadd2 = sub <4 x i64> zeroinitializer, %aadd1
627 %ac = icmp sge <4 x i64> %aadd1, zeroinitializer
628 %as = select <4 x i1> %ac, <4 x i64> %aadd1, <4 x i64> %aadd2
629 %aresult = trunc <4 x i64> %as to <4 x i32>
; Second pattern: same computation with the operands swapped.
630 %bzextsrc1 = zext <4 x i32> %src2 to <4 x i64>
631 %bzextsrc2 = zext <4 x i32> %src1 to <4 x i64>
632 %badd1 = sub <4 x i64> %bzextsrc1, %bzextsrc2
633 %badd2 = sub <4 x i64> zeroinitializer, %badd1
634 %bc = icmp sge <4 x i64> %badd1, zeroinitializer
635 %bs = select <4 x i1> %bc, <4 x i64> %badd1, <4 x i64> %badd2
636 %bresult = trunc <4 x i64> %bs to <4 x i32>
637 %r = add <4 x i32> %aresult, %bresult
; Unsigned absolute-difference pattern applied to lane-reversed inputs: both
; %src1 and %src2 go through a shufflevector with mask <3, 2, 1, 0> before the
; zext / sub / negate / select / trunc sequence.
; The expected output performs a single vabd.u32 on the unshuffled q0 and q1
; and then reverses the lanes of the result with four vmov.f32 moves.
641 define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32_shuffle(<4 x i32> %src1, <4 x i32> %src2) {
642 ; CHECK-LABEL: vabd_v4u32_shuffle:
644 ; CHECK-NEXT: vabd.u32 q1, q0, q1
645 ; CHECK-NEXT: vmov.f32 s0, s7
646 ; CHECK-NEXT: vmov.f32 s1, s6
647 ; CHECK-NEXT: vmov.f32 s2, s5
648 ; CHECK-NEXT: vmov.f32 s3, s4
650 %s1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
651 %s2 = shufflevector <4 x i32> %src2, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
652 %azextsrc1 = zext <4 x i32> %s1 to <4 x i64>
653 %azextsrc2 = zext <4 x i32> %s2 to <4 x i64>
654 %aadd1 = sub <4 x i64> %azextsrc1, %azextsrc2
655 %aadd2 = sub <4 x i64> zeroinitializer, %aadd1
656 %ac = icmp sge <4 x i64> %aadd1, zeroinitializer
657 %as = select <4 x i1> %ac, <4 x i64> %aadd1, <4 x i64> %aadd2
658 %aresult = trunc <4 x i64> %as to <4 x i32>
659 ret <4 x i32> %aresult
; Signed absolute difference of two <16 x i8> vectors feeding an add
; reduction: the i16 select result is passed directly (without a trunc) to
; llvm.vector.reduce.add.v16i16.
; The expected output is vabd.s8 followed by vaddv.u8 into r0.
663 define arm_aapcs_vfpcc i16 @vabds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
664 ; CHECK-LABEL: vabds_reduce_v16i8:
665 ; CHECK: @ %bb.0: @ %entry
666 ; CHECK-NEXT: vabd.s8 q0, q0, q1
667 ; CHECK-NEXT: vaddv.u8 r0, q0
670 %sextsrc1 = sext <16 x i8> %s0 to <16 x i16>
671 %sextsrc2 = sext <16 x i8> %s1 to <16 x i16>
672 %add1 = sub <16 x i16> %sextsrc1, %sextsrc2
673 %add2 = sub <16 x i16> zeroinitializer, %add1
674 %c = icmp sge <16 x i16> %add1, zeroinitializer
675 %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
676 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Unsigned absolute difference of two <16 x i8> vectors feeding an add
; reduction: the i16 select result is passed directly (without a trunc) to
; llvm.vector.reduce.add.v16i16.
; The expected output is vabd.u8 followed by vaddv.u8 into r0.
; NOTE(review): %sextsrc1/%sextsrc2 are produced by `zext`, not `sext`; the
; names look carried over from the signed variant - consider renaming.
680 define arm_aapcs_vfpcc i16 @vabdu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
681 ; CHECK-LABEL: vabdu_reduce_v16i8:
682 ; CHECK: @ %bb.0: @ %entry
683 ; CHECK-NEXT: vabd.u8 q0, q0, q1
684 ; CHECK-NEXT: vaddv.u8 r0, q0
687 %sextsrc1 = zext <16 x i8> %s0 to <16 x i16>
688 %sextsrc2 = zext <16 x i8> %s1 to <16 x i16>
689 %add1 = sub <16 x i16> %sextsrc1, %sextsrc2
690 %add2 = sub <16 x i16> zeroinitializer, %add1
691 %c = icmp sge <16 x i16> %add1, zeroinitializer
692 %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
693 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Declaration of the add-reduction intrinsic called by the two
; *_reduce_v16i8 functions in this file.
697 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)