1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s -o - | FileCheck %s
; Signed absolute difference of two <16 x i8> vectors, written as
; sext to i16 -> sub -> select(d >= 0, d, 0 - d) -> trunc back to i8.
; The CHECK lines expect the whole pattern to become one `vabd.s8 q0, q0, q1`.
4 define arm_aapcs_vfpcc <16 x i8> @vabd_v16s8(<16 x i8> %src1, <16 x i8> %src2) {
5 ; CHECK-LABEL: vabd_v16s8:
7 ; CHECK-NEXT: vabd.s8 q0, q0, q1
9 %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
10 %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
11 %add1 = sub <16 x i16> %sextsrc1, %sextsrc2
12 %add2 = sub <16 x i16> zeroinitializer, %add1
13 %c = icmp sge <16 x i16> %add1, zeroinitializer
14 %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
15 %result = trunc <16 x i16> %s to <16 x i8>
; Signed absolute difference on <8 x i8> operands (sext to i16, sub,
; select on sign, trunc to i8).
; The CHECK lines expect both inputs to be widened with `vmovlb.s8` and the
; difference to be taken with `vabd.s16`.
20 define arm_aapcs_vfpcc <8 x i8> @vabd_v8s8(<8 x i8> %src1, <8 x i8> %src2) {
21 ; CHECK-LABEL: vabd_v8s8:
22 ; CHECK-NEXT: vmovlb.s8 q1, q1
23 ; CHECK-NEXT: vmovlb.s8 q0, q0
24 ; CHECK-NEXT: vabd.s16 q0, q0, q1
26 %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
27 %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
28 %add1 = sub <8 x i16> %sextsrc1, %sextsrc2
29 %add2 = sub <8 x i16> zeroinitializer, %add1
30 %c = icmp sge <8 x i16> %add1, zeroinitializer
31 %s = select <8 x i1> %c, <8 x i16> %add1, <8 x i16> %add2
32 %result = trunc <8 x i16> %s to <8 x i8>
; Signed absolute difference on <4 x i8> operands. The IR only widens to i16,
; but the CHECK lines expect two rounds of widening (`vmovlb.s8` then
; `vmovlb.s16`) followed by `vabd.s32`.
37 define arm_aapcs_vfpcc <4 x i8> @vabd_v4s8(<4 x i8> %src1, <4 x i8> %src2) {
38 ; CHECK-LABEL: vabd_v4s8:
39 ; CHECK-NEXT: vmovlb.s8 q1, q1
40 ; CHECK-NEXT: vmovlb.s8 q0, q0
41 ; CHECK-NEXT: vmovlb.s16 q1, q1
42 ; CHECK-NEXT: vmovlb.s16 q0, q0
43 ; CHECK-NEXT: vabd.s32 q0, q0, q1
45 %sextsrc1 = sext <4 x i8> %src1 to <4 x i16>
46 %sextsrc2 = sext <4 x i8> %src2 to <4 x i16>
47 %add1 = sub <4 x i16> %sextsrc1, %sextsrc2
48 %add2 = sub <4 x i16> zeroinitializer, %add1
49 %c = icmp sge <4 x i16> %add1, zeroinitializer
50 %s = select <4 x i1> %c, <4 x i16> %add1, <4 x i16> %add2
51 %result = trunc <4 x i16> %s to <4 x i8>
; Signed absolute difference of two <8 x i16> vectors, computed in i32
; (sext, sub, select on sign) and truncated back to i16.
; The CHECK lines expect a single `vabd.s16 q0, q0, q1`.
56 define arm_aapcs_vfpcc <8 x i16> @vabd_v8s16(<8 x i16> %src1, <8 x i16> %src2) {
57 ; CHECK-LABEL: vabd_v8s16:
58 ; CHECK-NEXT: vabd.s16 q0, q0, q1
60 %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
61 %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
62 %add1 = sub <8 x i32> %sextsrc1, %sextsrc2
63 %add2 = sub <8 x i32> zeroinitializer, %add1
64 %c = icmp sge <8 x i32> %add1, zeroinitializer
65 %s = select <8 x i1> %c, <8 x i32> %add1, <8 x i32> %add2
66 %result = trunc <8 x i32> %s to <8 x i16>
; Signed absolute difference on <4 x i16> operands, computed in i32.
; The CHECK lines expect both inputs to be widened with `vmovlb.s16` and the
; difference to be taken with `vabd.s32`.
71 define arm_aapcs_vfpcc <4 x i16> @vabd_v4s16(<4 x i16> %src1, <4 x i16> %src2) {
72 ; CHECK-LABEL: vabd_v4s16:
73 ; CHECK-NEXT: vmovlb.s16 q1, q1
74 ; CHECK-NEXT: vmovlb.s16 q0, q0
75 ; CHECK-NEXT: vabd.s32 q0, q0, q1
77 %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
78 %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
79 %add1 = sub <4 x i32> %sextsrc1, %sextsrc2
80 %add2 = sub <4 x i32> zeroinitializer, %add1
81 %c = icmp sge <4 x i32> %add1, zeroinitializer
82 %s = select <4 x i1> %c, <4 x i32> %add1, <4 x i32> %add2
83 %result = trunc <4 x i32> %s to <4 x i16>
; Signed absolute difference of two <4 x i32> vectors, computed in i64
; (sext, sub, select on sign) and truncated back to i32.
; The CHECK lines expect a single `vabd.s32 q0, q0, q1`.
87 define arm_aapcs_vfpcc <4 x i32> @vabd_v4s32(<4 x i32> %src1, <4 x i32> %src2) {
88 ; CHECK-LABEL: vabd_v4s32:
90 ; CHECK-NEXT: vabd.s32 q0, q0, q1
92 %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
93 %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
94 %add1 = sub <4 x i64> %sextsrc1, %sextsrc2
95 %add2 = sub <4 x i64> zeroinitializer, %add1
96 %c = icmp sge <4 x i64> %add1, zeroinitializer
97 %s = select <4 x i1> %c, <4 x i64> %add1, <4 x i64> %add2
98 %result = trunc <4 x i64> %s to <4 x i32>
; Signed absolute difference on <2 x i32> operands, computed in <2 x i64>.
; The CHECK lines contain no `vabd`: each lane is moved to core registers
; (`vmov rN, sM`), subtracted with a `subs`/`sbc` pair, run through an
; `eor`/`subs`/`sbc` sequence that uses `asr #31` operands, and the results
; are written back into q0 with the two-lane `vmov q0[..], q0[..]` forms.
102 define arm_aapcs_vfpcc <2 x i32> @vabd_v2s32(<2 x i32> %src1, <2 x i32> %src2) {
103 ; CHECK-LABEL: vabd_v2s32:
105 ; CHECK-NEXT: vmov r0, s2
106 ; CHECK-NEXT: vmov r2, s6
107 ; CHECK-NEXT: asrs r1, r0, #31
108 ; CHECK-NEXT: subs r0, r0, r2
109 ; CHECK-NEXT: sbc.w r1, r1, r2, asr #31
110 ; CHECK-NEXT: eor.w r0, r0, r1, asr #31
111 ; CHECK-NEXT: eor.w r2, r1, r1, asr #31
112 ; CHECK-NEXT: subs.w r0, r0, r1, asr #31
113 ; CHECK-NEXT: sbc.w r12, r2, r1, asr #31
114 ; CHECK-NEXT: vmov r2, s0
115 ; CHECK-NEXT: vmov r1, s4
116 ; CHECK-NEXT: asrs r3, r2, #31
117 ; CHECK-NEXT: subs r2, r2, r1
118 ; CHECK-NEXT: sbc.w r1, r3, r1, asr #31
119 ; CHECK-NEXT: eor.w r2, r2, r1, asr #31
120 ; CHECK-NEXT: subs.w r2, r2, r1, asr #31
121 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
122 ; CHECK-NEXT: eor.w r0, r1, r1, asr #31
123 ; CHECK-NEXT: sbc.w r0, r0, r1, asr #31
124 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
126 %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
127 %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
128 %add1 = sub <2 x i64> %sextsrc1, %sextsrc2
129 %add2 = sub <2 x i64> zeroinitializer, %add1
130 %c = icmp sge <2 x i64> %add1, zeroinitializer
131 %s = select <2 x i1> %c, <2 x i64> %add1, <2 x i64> %add2
132 %result = trunc <2 x i64> %s to <2 x i32>
133 ret <2 x i32> %result
; Unsigned absolute difference of two <16 x i8> vectors: operands are
; zero-extended to i16 and subtracted; the sign test on the widened
; difference is still a signed compare (`icmp sge`), then the result is
; truncated to i8.
; The CHECK lines expect a single `vabd.u8 q0, q0, q1`.
136 define arm_aapcs_vfpcc <16 x i8> @vabd_v16u8(<16 x i8> %src1, <16 x i8> %src2) {
137 ; CHECK-LABEL: vabd_v16u8:
139 ; CHECK-NEXT: vabd.u8 q0, q0, q1
141 %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
142 %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
143 %add1 = sub <16 x i16> %zextsrc1, %zextsrc2
144 %add2 = sub <16 x i16> zeroinitializer, %add1
145 %c = icmp sge <16 x i16> %add1, zeroinitializer
146 %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
147 %result = trunc <16 x i16> %s to <16 x i8>
148 ret <16 x i8> %result
; Unsigned absolute difference on <8 x i8> operands (zext to i16, sub,
; select on sign, trunc to i8).
; The CHECK lines expect both inputs to be widened with `vmovlb.u8` and the
; difference to be taken with `vabd.u16`.
151 define arm_aapcs_vfpcc <8 x i8> @vabd_v8u8(<8 x i8> %src1, <8 x i8> %src2) {
152 ; CHECK-LABEL: vabd_v8u8:
154 ; CHECK-NEXT: vmovlb.u8 q1, q1
155 ; CHECK-NEXT: vmovlb.u8 q0, q0
156 ; CHECK-NEXT: vabd.u16 q0, q0, q1
158 %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
159 %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
160 %add1 = sub <8 x i16> %zextsrc1, %zextsrc2
161 %add2 = sub <8 x i16> zeroinitializer, %add1
162 %c = icmp sge <8 x i16> %add1, zeroinitializer
163 %s = select <8 x i1> %c, <8 x i16> %add1, <8 x i16> %add2
164 %result = trunc <8 x i16> %s to <8 x i8>
; Unsigned absolute difference on <4 x i8> operands (zext to i16, sub,
; select on sign, trunc to i8).
; Unlike the other narrow unsigned cases in this file, the CHECK lines here
; contain no `vabd`: both inputs are masked with #0xff (`vmov.i32` + `vand`),
; subtracted with `vsub.i32`, and the result is passed through `vabs.s32`.
168 define arm_aapcs_vfpcc <4 x i8> @vabd_v4u8(<4 x i8> %src1, <4 x i8> %src2) {
169 ; CHECK-LABEL: vabd_v4u8:
171 ; CHECK-NEXT: vmov.i32 q2, #0xff
172 ; CHECK-NEXT: vand q1, q1, q2
173 ; CHECK-NEXT: vand q0, q0, q2
174 ; CHECK-NEXT: vsub.i32 q0, q0, q1
175 ; CHECK-NEXT: vabs.s32 q0, q0
177 %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
178 %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
179 %add1 = sub <4 x i16> %zextsrc1, %zextsrc2
180 %add2 = sub <4 x i16> zeroinitializer, %add1
181 %c = icmp sge <4 x i16> %add1, zeroinitializer
182 %s = select <4 x i1> %c, <4 x i16> %add1, <4 x i16> %add2
183 %result = trunc <4 x i16> %s to <4 x i8>
; Unsigned absolute difference of two <8 x i16> vectors, computed in i32
; (zext, sub, select on sign) and truncated back to i16.
; The CHECK lines expect a single `vabd.u16 q0, q0, q1`.
187 define arm_aapcs_vfpcc <8 x i16> @vabd_v8u16(<8 x i16> %src1, <8 x i16> %src2) {
188 ; CHECK-LABEL: vabd_v8u16:
190 ; CHECK-NEXT: vabd.u16 q0, q0, q1
192 %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
193 %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
194 %add1 = sub <8 x i32> %zextsrc1, %zextsrc2
195 %add2 = sub <8 x i32> zeroinitializer, %add1
196 %c = icmp sge <8 x i32> %add1, zeroinitializer
197 %s = select <8 x i1> %c, <8 x i32> %add1, <8 x i32> %add2
198 %result = trunc <8 x i32> %s to <8 x i16>
199 ret <8 x i16> %result
; Unsigned absolute difference on <4 x i16> operands, computed in i32.
; The CHECK lines expect both inputs to be widened with `vmovlb.u16` and the
; difference to be taken with `vabd.u32`.
202 define arm_aapcs_vfpcc <4 x i16> @vabd_v4u16(<4 x i16> %src1, <4 x i16> %src2) {
203 ; CHECK-LABEL: vabd_v4u16:
205 ; CHECK-NEXT: vmovlb.u16 q1, q1
206 ; CHECK-NEXT: vmovlb.u16 q0, q0
207 ; CHECK-NEXT: vabd.u32 q0, q0, q1
209 %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
210 %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
211 %add1 = sub <4 x i32> %zextsrc1, %zextsrc2
212 %add2 = sub <4 x i32> zeroinitializer, %add1
213 %c = icmp sge <4 x i32> %add1, zeroinitializer
214 %s = select <4 x i1> %c, <4 x i32> %add1, <4 x i32> %add2
215 %result = trunc <4 x i32> %s to <4 x i16>
216 ret <4 x i16> %result
; Unsigned absolute difference of two <4 x i32> vectors, computed in i64
; (zext, sub, select on sign) and truncated back to i32.
; The CHECK lines expect a single `vabd.u32 q0, q0, q1`.
; NOTE(review): apart from the function name, the visible IR and CHECK lines
; match @vabd_v4u32 in this file, so this looks like a redundant test.
; Confirm before removing it: the `.LBB<n>_1` labels checked in the loop
; tests appear to be numbered by function position, so deleting a function
; would presumably require regenerating those assertions.
219 define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) {
220 ; CHECK-LABEL: vabd_u32:
222 ; CHECK-NEXT: vabd.u32 q0, q0, q1
224 %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
225 %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
226 %add1 = sub <4 x i64> %zextsrc1, %zextsrc2
227 %add2 = sub <4 x i64> zeroinitializer, %add1
228 %c = icmp sge <4 x i64> %add1, zeroinitializer
229 %s = select <4 x i1> %c, <4 x i64> %add1, <4 x i64> %add2
230 %result = trunc <4 x i64> %s to <4 x i32>
231 ret <4 x i32> %result
; Unsigned absolute difference of two <4 x i32> vectors, computed in i64
; (zext, sub, select on sign) and truncated back to i32.
; The CHECK lines expect a single `vabd.u32 q0, q0, q1`.
234 define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32(<4 x i32> %src1, <4 x i32> %src2) {
235 ; CHECK-LABEL: vabd_v4u32:
237 ; CHECK-NEXT: vabd.u32 q0, q0, q1
239 %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
240 %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
241 %add1 = sub <4 x i64> %zextsrc1, %zextsrc2
242 %add2 = sub <4 x i64> zeroinitializer, %add1
243 %c = icmp sge <4 x i64> %add1, zeroinitializer
244 %s = select <4 x i1> %c, <4 x i64> %add1, <4 x i64> %add2
245 %result = trunc <4 x i64> %s to <4 x i32>
246 ret <4 x i32> %result
; Unsigned absolute difference on <2 x i32> operands, computed in <2 x i64>.
; The CHECK lines contain no `vabd`: both inputs are masked with
; #0xffffffff (`vmov.i64` + `vand`), each 64-bit lane is moved to a pair of
; core registers, subtracted with `subs`/`sbc`, run through an
; `eor`/`subs`/`sbc` sequence that uses `asr #31` operands, and written back
; into q0. The expected code also saves and restores {r7, lr}.
249 define arm_aapcs_vfpcc <2 x i32> @vabd_v2u32(<2 x i32> %src1, <2 x i32> %src2) {
250 ; CHECK-LABEL: vabd_v2u32:
252 ; CHECK-NEXT: .save {r7, lr}
253 ; CHECK-NEXT: push {r7, lr}
254 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff
255 ; CHECK-NEXT: vand q1, q1, q2
256 ; CHECK-NEXT: vand q0, q0, q2
257 ; CHECK-NEXT: vmov r0, r1, d3
258 ; CHECK-NEXT: vmov r2, r3, d1
259 ; CHECK-NEXT: subs r0, r2, r0
260 ; CHECK-NEXT: sbc.w r1, r3, r1
261 ; CHECK-NEXT: eor.w r0, r0, r1, asr #31
262 ; CHECK-NEXT: eor.w r2, r1, r1, asr #31
263 ; CHECK-NEXT: subs.w lr, r0, r1, asr #31
264 ; CHECK-NEXT: sbc.w r12, r2, r1, asr #31
265 ; CHECK-NEXT: vmov r2, r3, d2
266 ; CHECK-NEXT: vmov r1, r0, d0
267 ; CHECK-NEXT: subs r1, r1, r2
268 ; CHECK-NEXT: sbcs r0, r3
269 ; CHECK-NEXT: eor.w r1, r1, r0, asr #31
270 ; CHECK-NEXT: subs.w r1, r1, r0, asr #31
271 ; CHECK-NEXT: vmov q0[2], q0[0], r1, lr
272 ; CHECK-NEXT: eor.w r1, r0, r0, asr #31
273 ; CHECK-NEXT: sbc.w r0, r1, r0, asr #31
274 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
275 ; CHECK-NEXT: pop {r7, pc}
276 %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
277 %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
278 %add1 = sub <2 x i64> %zextsrc1, %zextsrc2
279 %add2 = sub <2 x i64> zeroinitializer, %add1
280 %c = icmp sge <2 x i64> %add1, zeroinitializer
281 %s = select <2 x i1> %c, <2 x i64> %add1, <2 x i64> %add2
282 %result = trunc <2 x i64> %s to <2 x i32>
283 ret <2 x i32> %result
; Loop form of the signed i8 absolute difference. Each iteration loads
; <16 x i8> from %x and %y at %index, sign-extends both to i32, computes
; select(d < 0, 0 - d, d) on the difference, truncates to i8 and stores the
; result to %z. %index advances by 16 and the loop exits when it equals 1024.
; The CHECK lines expect lr to be preloaded with #64, a loop closed by
; `le lr`, post-incremented vldrb/vstrb accesses, and a single `vabd.s8`.
; %n is not referenced in the lines visible here.
286 define void @vabd_loop_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
287 ; CHECK-LABEL: vabd_loop_s8:
288 ; CHECK: @ %bb.0: @ %entry
289 ; CHECK-NEXT: .save {r7, lr}
290 ; CHECK-NEXT: push {r7, lr}
291 ; CHECK-NEXT: mov.w lr, #64
292 ; CHECK-NEXT: .LBB15_1: @ %vector.body
293 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
294 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16
295 ; CHECK-NEXT: vldrb.u8 q1, [r0], #16
296 ; CHECK-NEXT: vabd.s8 q0, q1, q0
297 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
298 ; CHECK-NEXT: le lr, .LBB15_1
299 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
300 ; CHECK-NEXT: pop {r7, pc}
302 br label %vector.body
304 vector.body: ; preds = %vector.body, %entry
305 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
306 %0 = getelementptr inbounds i8, ptr %x, i32 %index
307 %wide.load = load <16 x i8>, ptr %0, align 1
308 %1 = sext <16 x i8> %wide.load to <16 x i32>
309 %2 = getelementptr inbounds i8, ptr %y, i32 %index
310 %wide.load22 = load <16 x i8>, ptr %2, align 1
311 %3 = sext <16 x i8> %wide.load22 to <16 x i32>
312 %4 = sub nsw <16 x i32> %1, %3
313 %5 = icmp slt <16 x i32> %4, zeroinitializer
314 %6 = sub nsw <16 x i32> zeroinitializer, %4
315 %7 = select <16 x i1> %5, <16 x i32> %6, <16 x i32> %4
316 %8 = trunc <16 x i32> %7 to <16 x i8>
317 %9 = getelementptr inbounds i8, ptr %z, i32 %index
318 store <16 x i8> %8, ptr %9, align 1
319 %index.next = add i32 %index, 16
320 %10 = icmp eq i32 %index.next, 1024
321 br i1 %10, label %for.cond.cleanup, label %vector.body
323 for.cond.cleanup: ; preds = %vector.body
; Loop form of the signed i16 absolute difference. Each iteration loads
; <8 x i16> from %x and %y at %index, sign-extends both to i32, computes
; select(d < 0, 0 - d, d) on the difference, truncates to i16 and stores the
; result to %z. %index advances by 8 and the loop exits when it equals 1024.
; The CHECK lines expect lr to be preloaded with #128, a loop closed by
; `le lr`, post-incremented vldrh loads, a single `vabd.s16`, and the store
; emitted as `vstrb.8`.
; %n is not referenced in the lines visible here.
327 define void @vabd_loop_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
328 ; CHECK-LABEL: vabd_loop_s16:
329 ; CHECK: @ %bb.0: @ %entry
330 ; CHECK-NEXT: .save {r7, lr}
331 ; CHECK-NEXT: push {r7, lr}
332 ; CHECK-NEXT: mov.w lr, #128
333 ; CHECK-NEXT: .LBB16_1: @ %vector.body
334 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
335 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16
336 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16
337 ; CHECK-NEXT: vabd.s16 q0, q1, q0
338 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
339 ; CHECK-NEXT: le lr, .LBB16_1
340 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
341 ; CHECK-NEXT: pop {r7, pc}
343 br label %vector.body
345 vector.body: ; preds = %vector.body, %entry
346 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
347 %0 = getelementptr inbounds i16, ptr %x, i32 %index
348 %wide.load = load <8 x i16>, ptr %0, align 2
349 %1 = sext <8 x i16> %wide.load to <8 x i32>
350 %2 = getelementptr inbounds i16, ptr %y, i32 %index
351 %wide.load22 = load <8 x i16>, ptr %2, align 2
352 %3 = sext <8 x i16> %wide.load22 to <8 x i32>
353 %4 = sub nsw <8 x i32> %1, %3
354 %5 = icmp slt <8 x i32> %4, zeroinitializer
355 %6 = sub nsw <8 x i32> zeroinitializer, %4
356 %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
357 %8 = trunc <8 x i32> %7 to <8 x i16>
358 %9 = getelementptr inbounds i16, ptr %z, i32 %index
359 store <8 x i16> %8, ptr %9, align 2
360 %index.next = add i32 %index, 8
361 %10 = icmp eq i32 %index.next, 1024
362 br i1 %10, label %for.cond.cleanup, label %vector.body
364 for.cond.cleanup: ; preds = %vector.body
; Loop form of the signed i32 absolute difference. Each iteration loads
; <4 x i32> from %x and %y, sign-extends both to i64 and subtracts. The sign
; test (`icmp slt`) is done on the i64 difference, but the difference is
; truncated to i32 *before* the negate and select, unlike the s8/s16 loops
; where the truncation comes last. %index advances by 4 until it equals 1024.
; The CHECK lines contain no `vabd`: the per-lane differences are formed in
; core registers with `subs`/`sbc`, `asr #31` results are packed into r7 with
; `bfi`, r7 is written to p0 with `vmsr`, and a predicated `vsubt.i32`
; subtracts from the zero vector held in q0. lr is preloaded with #256.
; %n is not referenced in the lines visible here.
368 define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
369 ; CHECK-LABEL: vabd_loop_s32:
370 ; CHECK: @ %bb.0: @ %entry
371 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
372 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
373 ; CHECK-NEXT: mov.w lr, #256
374 ; CHECK-NEXT: vmov.i32 q0, #0x0
375 ; CHECK-NEXT: .LBB17_1: @ %vector.body
376 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
377 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
378 ; CHECK-NEXT: vmov.f32 s8, s6
379 ; CHECK-NEXT: vmov r7, s4
380 ; CHECK-NEXT: vmov.f32 s6, s7
381 ; CHECK-NEXT: vmov r3, s8
382 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16
383 ; CHECK-NEXT: vmov.f32 s12, s10
384 ; CHECK-NEXT: vmov.f32 s10, s5
385 ; CHECK-NEXT: vmov.f32 s14, s11
386 ; CHECK-NEXT: vmov r4, s12
387 ; CHECK-NEXT: asr.w r12, r3, #31
388 ; CHECK-NEXT: subs.w r8, r3, r4
389 ; CHECK-NEXT: sbc.w r12, r12, r4, asr #31
390 ; CHECK-NEXT: vmov r4, s10
391 ; CHECK-NEXT: vmov.f32 s10, s9
392 ; CHECK-NEXT: vmov r6, s10
393 ; CHECK-NEXT: asrs r3, r4, #31
394 ; CHECK-NEXT: subs r4, r4, r6
395 ; CHECK-NEXT: sbc.w r9, r3, r6, asr #31
396 ; CHECK-NEXT: vmov r6, s8
397 ; CHECK-NEXT: vmov r3, s6
398 ; CHECK-NEXT: subs r5, r7, r6
399 ; CHECK-NEXT: asr.w r7, r7, #31
400 ; CHECK-NEXT: vmov q2[2], q2[0], r5, r8
401 ; CHECK-NEXT: vmov r5, s14
402 ; CHECK-NEXT: sbc.w r6, r7, r6, asr #31
403 ; CHECK-NEXT: asrs r6, r6, #31
404 ; CHECK-NEXT: subs r7, r3, r5
405 ; CHECK-NEXT: asr.w r3, r3, #31
406 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r7
407 ; CHECK-NEXT: mov.w r7, #0
408 ; CHECK-NEXT: sbc.w r3, r3, r5, asr #31
409 ; CHECK-NEXT: bfi r7, r6, #0, #4
410 ; CHECK-NEXT: asr.w r4, r9, #31
411 ; CHECK-NEXT: asr.w r6, r12, #31
412 ; CHECK-NEXT: bfi r7, r4, #4, #4
413 ; CHECK-NEXT: asrs r3, r3, #31
414 ; CHECK-NEXT: bfi r7, r6, #8, #4
415 ; CHECK-NEXT: bfi r7, r3, #12, #4
416 ; CHECK-NEXT: vmsr p0, r7
418 ; CHECK-NEXT: vsubt.i32 q2, q0, q2
419 ; CHECK-NEXT: vstrb.8 q2, [r2], #16
420 ; CHECK-NEXT: le lr, .LBB17_1
421 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
422 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
424 br label %vector.body
426 vector.body: ; preds = %vector.body, %entry
427 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
428 %0 = getelementptr inbounds i32, ptr %x, i32 %index
429 %wide.load = load <4 x i32>, ptr %0, align 4
430 %1 = sext <4 x i32> %wide.load to <4 x i64>
431 %2 = getelementptr inbounds i32, ptr %y, i32 %index
432 %wide.load23 = load <4 x i32>, ptr %2, align 4
433 %3 = sext <4 x i32> %wide.load23 to <4 x i64>
434 %4 = sub nsw <4 x i64> %1, %3
435 %5 = icmp slt <4 x i64> %4, zeroinitializer
436 %6 = trunc <4 x i64> %4 to <4 x i32>
437 %7 = sub <4 x i32> zeroinitializer, %6
438 %8 = select <4 x i1> %5, <4 x i32> %7, <4 x i32> %6
439 %9 = getelementptr inbounds i32, ptr %z, i32 %index
440 store <4 x i32> %8, ptr %9, align 4
441 %index.next = add i32 %index, 4
442 %10 = icmp eq i32 %index.next, 1024
443 br i1 %10, label %for.cond.cleanup, label %vector.body
445 for.cond.cleanup: ; preds = %vector.body
; Loop form of the unsigned i8 absolute difference. Each iteration loads
; <16 x i8> from %x and %y at %index, zero-extends both to i32, computes
; select(d < 0, 0 - d, d) on the difference, truncates to i8 and stores the
; result to %z. %index advances by 16 and the loop exits when it equals 1024.
; The CHECK lines expect lr to be preloaded with #64, a loop closed by
; `le lr`, post-incremented vldrb/vstrb accesses, and a single `vabd.u8`.
; %n is not referenced in the lines visible here.
449 define void @vabd_loop_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
450 ; CHECK-LABEL: vabd_loop_u8:
451 ; CHECK: @ %bb.0: @ %entry
452 ; CHECK-NEXT: .save {r7, lr}
453 ; CHECK-NEXT: push {r7, lr}
454 ; CHECK-NEXT: mov.w lr, #64
455 ; CHECK-NEXT: .LBB18_1: @ %vector.body
456 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
457 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16
458 ; CHECK-NEXT: vldrb.u8 q1, [r0], #16
459 ; CHECK-NEXT: vabd.u8 q0, q1, q0
460 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
461 ; CHECK-NEXT: le lr, .LBB18_1
462 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
463 ; CHECK-NEXT: pop {r7, pc}
465 br label %vector.body
467 vector.body: ; preds = %vector.body, %entry
468 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
469 %0 = getelementptr inbounds i8, ptr %x, i32 %index
470 %wide.load = load <16 x i8>, ptr %0, align 1
471 %1 = zext <16 x i8> %wide.load to <16 x i32>
472 %2 = getelementptr inbounds i8, ptr %y, i32 %index
473 %wide.load22 = load <16 x i8>, ptr %2, align 1
474 %3 = zext <16 x i8> %wide.load22 to <16 x i32>
475 %4 = sub nsw <16 x i32> %1, %3
476 %5 = icmp slt <16 x i32> %4, zeroinitializer
477 %6 = sub nsw <16 x i32> zeroinitializer, %4
478 %7 = select <16 x i1> %5, <16 x i32> %6, <16 x i32> %4
479 %8 = trunc <16 x i32> %7 to <16 x i8>
480 %9 = getelementptr inbounds i8, ptr %z, i32 %index
481 store <16 x i8> %8, ptr %9, align 1
482 %index.next = add i32 %index, 16
483 %10 = icmp eq i32 %index.next, 1024
484 br i1 %10, label %for.cond.cleanup, label %vector.body
486 for.cond.cleanup: ; preds = %vector.body
; Loop form of the unsigned i16 absolute difference. Each iteration loads
; <8 x i16> from %x and %y at %index, zero-extends both to i32, computes
; select(d < 0, 0 - d, d) on the difference, truncates to i16 and stores the
; result to %z. %index advances by 8 and the loop exits when it equals 1024.
; The CHECK lines expect lr to be preloaded with #128, a loop closed by
; `le lr`, post-incremented vldrh loads, a single `vabd.u16`, and the store
; emitted as `vstrb.8`.
; %n is not referenced in the lines visible here.
490 define void @vabd_loop_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
491 ; CHECK-LABEL: vabd_loop_u16:
492 ; CHECK: @ %bb.0: @ %entry
493 ; CHECK-NEXT: .save {r7, lr}
494 ; CHECK-NEXT: push {r7, lr}
495 ; CHECK-NEXT: mov.w lr, #128
496 ; CHECK-NEXT: .LBB19_1: @ %vector.body
497 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
498 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16
499 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16
500 ; CHECK-NEXT: vabd.u16 q0, q1, q0
501 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
502 ; CHECK-NEXT: le lr, .LBB19_1
503 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
504 ; CHECK-NEXT: pop {r7, pc}
506 br label %vector.body
508 vector.body: ; preds = %vector.body, %entry
509 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
510 %0 = getelementptr inbounds i16, ptr %x, i32 %index
511 %wide.load = load <8 x i16>, ptr %0, align 2
512 %1 = zext <8 x i16> %wide.load to <8 x i32>
513 %2 = getelementptr inbounds i16, ptr %y, i32 %index
514 %wide.load22 = load <8 x i16>, ptr %2, align 2
515 %3 = zext <8 x i16> %wide.load22 to <8 x i32>
516 %4 = sub nsw <8 x i32> %1, %3
517 %5 = icmp slt <8 x i32> %4, zeroinitializer
518 %6 = sub nsw <8 x i32> zeroinitializer, %4
519 %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
520 %8 = trunc <8 x i32> %7 to <8 x i16>
521 %9 = getelementptr inbounds i16, ptr %z, i32 %index
522 store <8 x i16> %8, ptr %9, align 2
523 %index.next = add i32 %index, 8
524 %10 = icmp eq i32 %index.next, 1024
525 br i1 %10, label %for.cond.cleanup, label %vector.body
527 for.cond.cleanup: ; preds = %vector.body
; Loop form of the unsigned i32 absolute difference. Each iteration loads
; <4 x i32> from %x and %y, zero-extends both to i64 and subtracts. The sign
; test (`icmp slt`) is done on the i64 difference, but the difference is
; truncated to i32 *before* the negate and select, unlike the u8/u16 loops
; where the truncation comes last. %index advances by 4 until it equals 1024.
; The CHECK lines contain no `vabd`: lanes are masked with #0xffffffff
; (`vmov.i64` + `vand`), differences are formed in core registers with
; `subs`/`sbc`, `asr #31` results are packed into r4 with `bfi`, r4 is
; written to p0 with `vmsr`, and a predicated `vsubt.i32` subtracts from the
; zero vector held in q1. The expected code also saves d8-d11 and preloads
; lr with #256.
; %n is not referenced in the lines visible here.
531 define void @vabd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
532 ; CHECK-LABEL: vabd_loop_u32:
533 ; CHECK: @ %bb.0: @ %entry
534 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
535 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
536 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
537 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
538 ; CHECK-NEXT: mov.w lr, #256
539 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff
540 ; CHECK-NEXT: vmov.i32 q1, #0x0
541 ; CHECK-NEXT: .LBB20_1: @ %vector.body
542 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
543 ; CHECK-NEXT: vldrw.u32 q4, [r1], #16
544 ; CHECK-NEXT: vldrw.u32 q5, [r0], #16
545 ; CHECK-NEXT: vmov.f32 s8, s18
546 ; CHECK-NEXT: vmov.f32 s10, s19
547 ; CHECK-NEXT: vmov.f32 s12, s22
548 ; CHECK-NEXT: vand q2, q2, q0
549 ; CHECK-NEXT: vmov.f32 s14, s23
550 ; CHECK-NEXT: vand q3, q3, q0
551 ; CHECK-NEXT: vmov r3, r12, d4
552 ; CHECK-NEXT: vmov r4, r5, d6
553 ; CHECK-NEXT: vmov.f32 s18, s17
554 ; CHECK-NEXT: vmov.f32 s22, s21
555 ; CHECK-NEXT: vand q4, q4, q0
556 ; CHECK-NEXT: vand q5, q5, q0
557 ; CHECK-NEXT: vmov r6, r7, d11
558 ; CHECK-NEXT: subs.w r8, r4, r3
559 ; CHECK-NEXT: sbc.w r12, r5, r12
560 ; CHECK-NEXT: vmov r5, r3, d9
561 ; CHECK-NEXT: subs.w r10, r6, r5
562 ; CHECK-NEXT: sbc.w r9, r7, r3
563 ; CHECK-NEXT: vmov r6, r7, d8
564 ; CHECK-NEXT: vmov r4, r3, d10
565 ; CHECK-NEXT: subs r4, r4, r6
566 ; CHECK-NEXT: sbcs r3, r7
567 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r8
568 ; CHECK-NEXT: vmov r4, r6, d5
569 ; CHECK-NEXT: vmov r7, r5, d7
570 ; CHECK-NEXT: asrs r3, r3, #31
571 ; CHECK-NEXT: subs r4, r7, r4
572 ; CHECK-NEXT: vmov q4[3], q4[1], r10, r4
573 ; CHECK-NEXT: mov.w r4, #0
574 ; CHECK-NEXT: bfi r4, r3, #0, #4
575 ; CHECK-NEXT: asr.w r3, r9, #31
576 ; CHECK-NEXT: bfi r4, r3, #4, #4
577 ; CHECK-NEXT: asr.w r3, r12, #31
578 ; CHECK-NEXT: bfi r4, r3, #8, #4
579 ; CHECK-NEXT: sbc.w r3, r5, r6
580 ; CHECK-NEXT: asrs r3, r3, #31
581 ; CHECK-NEXT: bfi r4, r3, #12, #4
582 ; CHECK-NEXT: vmsr p0, r4
584 ; CHECK-NEXT: vsubt.i32 q4, q1, q4
585 ; CHECK-NEXT: vstrb.8 q4, [r2], #16
586 ; CHECK-NEXT: le lr, .LBB20_1
587 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
588 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
589 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
591 br label %vector.body
593 vector.body: ; preds = %vector.body, %entry
594 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
595 %0 = getelementptr inbounds i32, ptr %x, i32 %index
596 %wide.load = load <4 x i32>, ptr %0, align 4
597 %1 = zext <4 x i32> %wide.load to <4 x i64>
598 %2 = getelementptr inbounds i32, ptr %y, i32 %index
599 %wide.load23 = load <4 x i32>, ptr %2, align 4
600 %3 = zext <4 x i32> %wide.load23 to <4 x i64>
601 %4 = sub nsw <4 x i64> %1, %3
602 %5 = icmp slt <4 x i64> %4, zeroinitializer
603 %6 = trunc <4 x i64> %4 to <4 x i32>
604 %7 = sub <4 x i32> zeroinitializer, %6
605 %8 = select <4 x i1> %5, <4 x i32> %7, <4 x i32> %6
606 %9 = getelementptr inbounds i32, ptr %z, i32 %index
607 store <4 x i32> %8, ptr %9, align 4
608 %index.next = add i32 %index, 4
609 %10 = icmp eq i32 %index.next, 1024
610 br i1 %10, label %for.cond.cleanup, label %vector.body
612 for.cond.cleanup: ; preds = %vector.body
; Commutativity test: the unsigned absolute-difference pattern is written
; twice, once as (%src1, %src2) ("a" values) and once with the operands
; swapped as (%src2, %src1) ("b" values), and the two results are added.
; The CHECK lines expect only one `vabd.u32`, followed by `vadd.i32 q0, q0, q0`,
; i.e. both patterns resolve to the same value.
616 define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32_commutative(<4 x i32> %src1, <4 x i32> %src2) {
617 ; CHECK-LABEL: vabd_v4u32_commutative:
619 ; CHECK-NEXT: vabd.u32 q0, q1, q0
620 ; CHECK-NEXT: vadd.i32 q0, q0, q0
622 %azextsrc1 = zext <4 x i32> %src1 to <4 x i64>
623 %azextsrc2 = zext <4 x i32> %src2 to <4 x i64>
624 %aadd1 = sub <4 x i64> %azextsrc1, %azextsrc2
625 %aadd2 = sub <4 x i64> zeroinitializer, %aadd1
626 %ac = icmp sge <4 x i64> %aadd1, zeroinitializer
627 %as = select <4 x i1> %ac, <4 x i64> %aadd1, <4 x i64> %aadd2
628 %aresult = trunc <4 x i64> %as to <4 x i32>
629 %bzextsrc1 = zext <4 x i32> %src2 to <4 x i64>
630 %bzextsrc2 = zext <4 x i32> %src1 to <4 x i64>
631 %badd1 = sub <4 x i64> %bzextsrc1, %bzextsrc2
632 %badd2 = sub <4 x i64> zeroinitializer, %badd1
633 %bc = icmp sge <4 x i64> %badd1, zeroinitializer
634 %bs = select <4 x i1> %bc, <4 x i64> %badd1, <4 x i64> %badd2
635 %bresult = trunc <4 x i64> %bs to <4 x i32>
636 %r = add <4 x i32> %aresult, %bresult
; Unsigned absolute difference where both inputs are first lane-reversed
; with `shufflevector ... <3, 2, 1, 0>`.
; The CHECK lines expect the `vabd.u32` to be issued on the unshuffled
; inputs, with the lane reversal applied afterwards as four `vmov.f32`
; moves on the result.
; The unused second shufflevector operand is `poison` rather than `undef`:
; the mask only selects lanes 0-3 of the first operand, so the placeholder
; value is never read.
640 define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32_shuffle(<4 x i32> %src1, <4 x i32> %src2) {
641 ; CHECK-LABEL: vabd_v4u32_shuffle:
643 ; CHECK-NEXT: vabd.u32 q1, q0, q1
644 ; CHECK-NEXT: vmov.f32 s0, s7
645 ; CHECK-NEXT: vmov.f32 s1, s6
646 ; CHECK-NEXT: vmov.f32 s2, s5
647 ; CHECK-NEXT: vmov.f32 s3, s4
649 %s1 = shufflevector <4 x i32> %src1, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
650 %s2 = shufflevector <4 x i32> %src2, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
651 %azextsrc1 = zext <4 x i32> %s1 to <4 x i64>
652 %azextsrc2 = zext <4 x i32> %s2 to <4 x i64>
653 %aadd1 = sub <4 x i64> %azextsrc1, %azextsrc2
654 %aadd2 = sub <4 x i64> zeroinitializer, %aadd1
655 %ac = icmp sge <4 x i64> %aadd1, zeroinitializer
656 %as = select <4 x i1> %ac, <4 x i64> %aadd1, <4 x i64> %aadd2
657 %aresult = trunc <4 x i64> %as to <4 x i32>
658 ret <4 x i32> %aresult
; Signed absolute difference feeding an add reduction: the <16 x i8> inputs
; are sign-extended to i16, the absolute difference is formed with
; sub + select on sign, and the <16 x i16> result is summed with
; @llvm.vector.reduce.add.v16i16 (no truncation back to i8 in the IR).
; The CHECK lines expect `vabd.s8` followed by `vaddv.u8 r0, q0`.
662 define arm_aapcs_vfpcc i16 @vabds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
663 ; CHECK-LABEL: vabds_reduce_v16i8:
664 ; CHECK: @ %bb.0: @ %entry
665 ; CHECK-NEXT: vabd.s8 q0, q0, q1
666 ; CHECK-NEXT: vaddv.u8 r0, q0
669 %sextsrc1 = sext <16 x i8> %s0 to <16 x i16>
670 %sextsrc2 = sext <16 x i8> %s1 to <16 x i16>
671 %add1 = sub <16 x i16> %sextsrc1, %sextsrc2
672 %add2 = sub <16 x i16> zeroinitializer, %add1
673 %c = icmp sge <16 x i16> %add1, zeroinitializer
674 %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
675 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Unsigned absolute difference feeding an add reduction: the <16 x i8> inputs
; are zero-extended to i16, the absolute difference is formed with
; sub + select on sign, and the <16 x i16> result is summed with
; @llvm.vector.reduce.add.v16i16 (no truncation back to i8 in the IR).
; The CHECK lines expect `vabd.u8` followed by `vaddv.u8 r0, q0`.
; The widened operands are named %zextsrc* to reflect the `zext` that
; produces them, matching the other unsigned tests in this file.
679 define arm_aapcs_vfpcc i16 @vabdu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
680 ; CHECK-LABEL: vabdu_reduce_v16i8:
681 ; CHECK: @ %bb.0: @ %entry
682 ; CHECK-NEXT: vabd.u8 q0, q0, q1
683 ; CHECK-NEXT: vaddv.u8 r0, q0
686 %zextsrc1 = zext <16 x i8> %s0 to <16 x i16>
687 %zextsrc2 = zext <16 x i8> %s1 to <16 x i16>
688 %add1 = sub <16 x i16> %zextsrc1, %zextsrc2
689 %add2 = sub <16 x i16> zeroinitializer, %add1
690 %c = icmp sge <16 x i16> %add1, zeroinitializer
691 %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
692 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Declaration of the add-reduction intrinsic called by @vabds_reduce_v16i8
; and @vabdu_reduce_v16i8: takes a <16 x i16> vector and returns an i16.
696 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)