1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s
; thres_i32: in-place threshold over i32 data, four lanes per iteration.
; Each vector.body iteration loads <4 x i32> at %data + %index, selects the
; lanes that are (slt splat(%T)) or (sgt splat(0 - %T)), and masked-stores zero
; to those lanes. %index advances by 4 until it equals %mul (zext(%N) << 2);
; the loop is bypassed when %N == 0.
; The expected assembly forms the mask with a single vpte.s32 block: a vcmpt in
; the "t" slot and the zeroing vstrwe in the "e" slot.
4 define arm_aapcs_vfpcc void @thres_i32(ptr %data, i16 zeroext %N, i32 %T) {
5 ; CHECK-LABEL: thres_i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r7, lr}
8 ; CHECK-NEXT: push {r7, lr}
9 ; CHECK-NEXT: cmp r1, #0
11 ; CHECK-NEXT: popeq {r7, pc}
12 ; CHECK-NEXT: .LBB0_1: @ %vector.ph
13 ; CHECK-NEXT: mvn r3, #3
14 ; CHECK-NEXT: add.w r1, r3, r1, lsl #2
15 ; CHECK-NEXT: movs r3, #1
16 ; CHECK-NEXT: vmov.i32 q0, #0x0
17 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2
18 ; CHECK-NEXT: rsbs r1, r2, #0
19 ; CHECK-NEXT: .LBB0_2: @ %vector.body
20 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
21 ; CHECK-NEXT: vldrw.u32 q1, [r0]
22 ; CHECK-NEXT: vpte.s32 ge, q1, r2
23 ; CHECK-NEXT: vcmpt.s32 le, q1, r1
24 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16
25 ; CHECK-NEXT: le lr, .LBB0_2
26 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
27 ; CHECK-NEXT: pop {r7, pc}
; %mul is the loop bound in elements: zext(%N) * 4.
29 %conv = zext i16 %N to i32
30 %mul = shl nuw nsw i32 %conv, 2
31 %cmp15 = icmp eq i16 %N, 0
32 br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
; Preheader: splat %T and its negation (%sub = 0 - %T) across four lanes.
; NOTE(review): no terminator is visible at the end of this block in this
; view; presumably it branches to vector.body -- confirm.
34 vector.ph: ; preds = %entry
35 %sub = sub nsw i32 0, %T
36 %broadcast.splatinsert17 = insertelement <4 x i32> undef, i32 %T, i32 0
37 %broadcast.splat18 = shufflevector <4 x i32> %broadcast.splatinsert17, <4 x i32> undef, <4 x i32> zeroinitializer
38 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %sub, i32 0
39 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
; Loop body: %3 is the per-lane store mask, (x < T) | (x > -T).
42 vector.body: ; preds = %vector.body, %vector.ph
43 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
44 %0 = getelementptr inbounds i32, ptr %data, i32 %index
45 %wide.load = load <4 x i32>, ptr %0, align 4
46 %1 = icmp slt <4 x i32> %wide.load, %broadcast.splat18
47 %2 = icmp sgt <4 x i32> %wide.load, %broadcast.splat20
48 %3 = or <4 x i1> %1, %2
49 call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr %0, i32 4, <4 x i1> %3)
50 %index.next = add i32 %index, 4
51 %4 = icmp eq i32 %index.next, %mul
52 br i1 %4, label %for.cond.cleanup, label %vector.body
54 for.cond.cleanup: ; preds = %vector.body, %entry
; thresh_i16: in-place threshold over i16 data, eight lanes per iteration.
; Each vector.body iteration loads <8 x i16> at %data + %index, selects the
; lanes that are (slt splat(%T)) or (sgt splat(%sub)), and masked-stores zero
; to those lanes. %index advances by 8 until it equals %mul (zext(%N) << 3);
; the loop is bypassed when %N == 0.
; The expected assembly uses a vpte.s16 block with a vcmpt in the "t" slot and
; the zeroing vstrhe in the "e" slot.
58 define arm_aapcs_vfpcc void @thresh_i16(ptr %data, i16 zeroext %N, i16 signext %T) {
59 ; CHECK-LABEL: thresh_i16:
60 ; CHECK: @ %bb.0: @ %entry
61 ; CHECK-NEXT: .save {r7, lr}
62 ; CHECK-NEXT: push {r7, lr}
63 ; CHECK-NEXT: cmp r1, #0
65 ; CHECK-NEXT: popeq {r7, pc}
66 ; CHECK-NEXT: .LBB1_1: @ %vector.ph
67 ; CHECK-NEXT: mvn r3, #7
68 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3
69 ; CHECK-NEXT: movs r3, #1
70 ; CHECK-NEXT: vmov.i32 q0, #0x0
71 ; CHECK-NEXT: add.w lr, r3, r1, lsr #3
72 ; CHECK-NEXT: rsbs r1, r2, #0
73 ; CHECK-NEXT: .LBB1_2: @ %vector.body
74 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
75 ; CHECK-NEXT: vldrh.u16 q1, [r0]
76 ; CHECK-NEXT: vpte.s16 ge, q1, r2
77 ; CHECK-NEXT: vcmpt.s16 le, q1, r1
78 ; CHECK-NEXT: vstrhe.16 q0, [r0], #16
79 ; CHECK-NEXT: le lr, .LBB1_2
80 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
81 ; CHECK-NEXT: pop {r7, pc}
; %mul is the loop bound in elements: zext(%N) * 8.
83 %conv2 = zext i16 %N to i32
84 %mul = shl nuw nsw i32 %conv2, 3
85 %cmp22 = icmp eq i16 %N, 0
86 br i1 %cmp22, label %for.cond.cleanup, label %vector.ph
; Preheader: splat %T and %sub across eight lanes.
; NOTE(review): %sub is used below but its definition is not visible in this
; view; presumably 0 - %T (the expected assembly negates r2 with rsbs) --
; confirm. No terminator is visible at the end of this block either;
; presumably it branches to vector.body -- confirm.
88 vector.ph: ; preds = %entry
90 %broadcast.splatinsert24 = insertelement <8 x i16> undef, i16 %T, i32 0
91 %broadcast.splat25 = shufflevector <8 x i16> %broadcast.splatinsert24, <8 x i16> undef, <8 x i32> zeroinitializer
92 %broadcast.splatinsert26 = insertelement <8 x i16> undef, i16 %sub, i32 0
93 %broadcast.splat27 = shufflevector <8 x i16> %broadcast.splatinsert26, <8 x i16> undef, <8 x i32> zeroinitializer
; Loop body: %3 is the per-lane store mask, (x < splat25) | (x > splat27).
96 vector.body: ; preds = %vector.body, %vector.ph
97 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
98 %0 = getelementptr inbounds i16, ptr %data, i32 %index
99 %wide.load = load <8 x i16>, ptr %0, align 2
100 %1 = icmp slt <8 x i16> %wide.load, %broadcast.splat25
101 %2 = icmp sgt <8 x i16> %wide.load, %broadcast.splat27
102 %3 = or <8 x i1> %1, %2
103 call void @llvm.masked.store.v8i16.p0(<8 x i16> zeroinitializer, ptr %0, i32 2, <8 x i1> %3)
104 %index.next = add i32 %index, 8
105 %4 = icmp eq i32 %index.next, %mul
106 br i1 %4, label %for.cond.cleanup, label %vector.body
108 for.cond.cleanup: ; preds = %vector.body, %entry
; thresh_i8: in-place threshold over i8 data, sixteen lanes per iteration.
; Each vector.body iteration loads <16 x i8> at %data + %index, selects the
; lanes that are (slt splat(%T)) or (sgt splat(%sub)), and masked-stores zero
; to those lanes. %index advances by 16 until it equals %mul (zext(%N) << 4);
; the loop is bypassed when %N == 0.
; The expected assembly uses a vpte.s8 block with a vcmpt in the "t" slot and
; the zeroing vstrbe in the "e" slot.
112 define arm_aapcs_vfpcc void @thresh_i8(ptr %data, i16 zeroext %N, i8 signext %T) {
113 ; CHECK-LABEL: thresh_i8:
114 ; CHECK: @ %bb.0: @ %entry
115 ; CHECK-NEXT: .save {r7, lr}
116 ; CHECK-NEXT: push {r7, lr}
117 ; CHECK-NEXT: cmp r1, #0
119 ; CHECK-NEXT: popeq {r7, pc}
120 ; CHECK-NEXT: .LBB2_1: @ %vector.ph
121 ; CHECK-NEXT: mvn r3, #15
122 ; CHECK-NEXT: add.w r1, r3, r1, lsl #4
123 ; CHECK-NEXT: movs r3, #1
124 ; CHECK-NEXT: vmov.i32 q0, #0x0
125 ; CHECK-NEXT: add.w lr, r3, r1, lsr #4
126 ; CHECK-NEXT: rsbs r1, r2, #0
127 ; CHECK-NEXT: .LBB2_2: @ %vector.body
128 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
129 ; CHECK-NEXT: vldrb.u8 q1, [r0]
130 ; CHECK-NEXT: vpte.s8 ge, q1, r2
131 ; CHECK-NEXT: vcmpt.s8 le, q1, r1
132 ; CHECK-NEXT: vstrbe.8 q0, [r0], #16
133 ; CHECK-NEXT: le lr, .LBB2_2
134 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
135 ; CHECK-NEXT: pop {r7, pc}
; %mul is the loop bound in elements: zext(%N) * 16.
137 %conv2 = zext i16 %N to i32
138 %mul = shl nuw nsw i32 %conv2, 4
139 %cmp20 = icmp eq i16 %N, 0
140 br i1 %cmp20, label %for.cond.cleanup, label %vector.ph
; Preheader: splat %T and %sub across sixteen lanes.
; NOTE(review): %sub is used below but its definition is not visible in this
; view; presumably 0 - %T (the expected assembly negates r2 with rsbs) --
; confirm.
142 vector.ph: ; preds = %entry
144 %broadcast.splatinsert22 = insertelement <16 x i8> undef, i8 %T, i32 0
145 %broadcast.splat23 = shufflevector <16 x i8> %broadcast.splatinsert22, <16 x i8> undef, <16 x i32> zeroinitializer
146 %broadcast.splatinsert24 = insertelement <16 x i8> undef, i8 %sub, i32 0
147 %broadcast.splat25 = shufflevector <16 x i8> %broadcast.splatinsert24, <16 x i8> undef, <16 x i32> zeroinitializer
148 br label %vector.body
; Loop body: %3 is the per-lane store mask, (x < splat23) | (x > splat25).
150 vector.body: ; preds = %vector.body, %vector.ph
151 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
152 %0 = getelementptr inbounds i8, ptr %data, i32 %index
153 %wide.load = load <16 x i8>, ptr %0, align 1
154 %1 = icmp slt <16 x i8> %wide.load, %broadcast.splat23
155 %2 = icmp sgt <16 x i8> %wide.load, %broadcast.splat25
156 %3 = or <16 x i1> %1, %2
157 call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr %0, i32 1, <16 x i1> %3)
158 %index.next = add i32 %index, 16
159 %4 = icmp eq i32 %index.next, %mul
160 br i1 %4, label %for.cond.cleanup, label %vector.body
162 for.cond.cleanup: ; preds = %vector.body, %entry
; thresh_f32: in-place threshold over float data, four lanes per iteration.
; Each vector.body iteration loads <4 x float> at %data + %index, selects the
; lanes that are (fast olt splat(%T)) or (fast ogt splat(fneg %T)), and
; masked-stores zero to those lanes. %index advances by 4 until it equals %mul
; (zext(%N) << 2); the loop is bypassed when %N == 0.
; The expected assembly moves the threshold to a core register, flips its sign
; bit with eor #-2147483648 for the negated copy, and uses a vpte.f32 block
; with a vcmpt in the "t" slot and the zeroing vstrwe in the "e" slot.
166 define arm_aapcs_vfpcc void @thresh_f32(ptr %data, i16 zeroext %N, float %T) {
167 ; CHECK-LABEL: thresh_f32:
168 ; CHECK: @ %bb.0: @ %entry
169 ; CHECK-NEXT: .save {r7, lr}
170 ; CHECK-NEXT: push {r7, lr}
171 ; CHECK-NEXT: cmp r1, #0
173 ; CHECK-NEXT: popeq {r7, pc}
174 ; CHECK-NEXT: .LBB3_1: @ %vector.ph
175 ; CHECK-NEXT: mvn r2, #3
176 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
177 ; CHECK-NEXT: movs r2, #1
178 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2
179 ; CHECK-NEXT: vmov r1, s0
180 ; CHECK-NEXT: vmov.i32 q0, #0x0
181 ; CHECK-NEXT: eor r2, r1, #-2147483648
182 ; CHECK-NEXT: .LBB3_2: @ %vector.body
183 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
184 ; CHECK-NEXT: vldrw.u32 q1, [r0]
185 ; CHECK-NEXT: vpte.f32 ge, q1, r1
186 ; CHECK-NEXT: vcmpt.f32 le, q1, r2
187 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16
188 ; CHECK-NEXT: le lr, .LBB3_2
189 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
190 ; CHECK-NEXT: pop {r7, pc}
; %mul is the loop bound in elements: zext(%N) * 4.
192 %conv = zext i16 %N to i32
193 %mul = shl nuw nsw i32 %conv, 2
194 %cmp15 = icmp eq i16 %N, 0
195 br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
; Preheader: splat %T and its negation (%fneg) across four lanes.
197 vector.ph: ; preds = %entry
198 %fneg = fneg fast float %T
199 %broadcast.splatinsert17 = insertelement <4 x float> undef, float %T, i32 0
200 %broadcast.splat18 = shufflevector <4 x float> %broadcast.splatinsert17, <4 x float> undef, <4 x i32> zeroinitializer
201 %broadcast.splatinsert19 = insertelement <4 x float> undef, float %fneg, i32 0
202 %broadcast.splat20 = shufflevector <4 x float> %broadcast.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
203 br label %vector.body
; Loop body: %3 is the per-lane store mask, (x < T) | (x > -T).
205 vector.body: ; preds = %vector.body, %vector.ph
206 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
207 %0 = getelementptr inbounds float, ptr %data, i32 %index
208 %wide.load = load <4 x float>, ptr %0, align 4
209 %1 = fcmp fast olt <4 x float> %wide.load, %broadcast.splat18
210 %2 = fcmp fast ogt <4 x float> %wide.load, %broadcast.splat20
211 %3 = or <4 x i1> %1, %2
212 call void @llvm.masked.store.v4f32.p0(<4 x float> zeroinitializer, ptr %0, i32 4, <4 x i1> %3)
213 %index.next = add i32 %index, 4
214 %4 = icmp eq i32 %index.next, %mul
215 br i1 %4, label %for.cond.cleanup, label %vector.body
217 for.cond.cleanup: ; preds = %vector.body, %entry
; thresh_f16: in-place threshold over half data, eight lanes per iteration.
; The threshold arrives as float %T.coerce; its low 16 bits are reinterpreted
; as a half (bitcast to i32, trunc to i16, bitcast to half).
; Each vector.body iteration loads <8 x half> at %data + %index, selects the
; lanes that are (fast olt splat(T)) or (fast ogt splat(fneg T)), and
; masked-stores zero to those lanes. %index advances by 8 until it equals %mul
; (zext(%N) << 3); the loop is bypassed when %N == 0.
; The expected assembly negates the threshold with vneg.f16 and uses a
; vpte.f16 block with a vcmpt in the "t" slot and the zeroing vstrhe in the
; "e" slot.
221 define arm_aapcs_vfpcc void @thresh_f16(ptr %data, i16 zeroext %N, float %T.coerce) {
222 ; CHECK-LABEL: thresh_f16:
223 ; CHECK: @ %bb.0: @ %entry
224 ; CHECK-NEXT: .save {r7, lr}
225 ; CHECK-NEXT: push {r7, lr}
226 ; CHECK-NEXT: cmp r1, #0
228 ; CHECK-NEXT: popeq {r7, pc}
229 ; CHECK-NEXT: .LBB4_1: @ %vector.ph
230 ; CHECK-NEXT: mvn r3, #7
231 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3
232 ; CHECK-NEXT: vmov r2, s0
233 ; CHECK-NEXT: vneg.f16 s0, s0
234 ; CHECK-NEXT: movs r3, #1
235 ; CHECK-NEXT: add.w lr, r3, r1, lsr #3
236 ; CHECK-NEXT: vmov.f16 r1, s0
237 ; CHECK-NEXT: vmov.i32 q0, #0x0
238 ; CHECK-NEXT: .LBB4_2: @ %vector.body
239 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
240 ; CHECK-NEXT: vldrh.u16 q1, [r0]
241 ; CHECK-NEXT: vpte.f16 ge, q1, r2
242 ; CHECK-NEXT: vcmpt.f16 le, q1, r1
243 ; CHECK-NEXT: vstrhe.16 q0, [r0], #16
244 ; CHECK-NEXT: le lr, .LBB4_2
245 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
246 ; CHECK-NEXT: pop {r7, pc}
; Recover the half threshold (%1) from the low 16 bits of %T.coerce.
248 %0 = bitcast float %T.coerce to i32
249 %tmp.0.extract.trunc = trunc i32 %0 to i16
250 %1 = bitcast i16 %tmp.0.extract.trunc to half
; %mul is the loop bound in elements: zext(%N) * 8.
251 %conv = zext i16 %N to i32
252 %mul = shl nuw nsw i32 %conv, 3
253 %cmp17 = icmp eq i16 %N, 0
254 br i1 %cmp17, label %for.cond.cleanup, label %vector.ph
; Preheader: splat the threshold and its negation (%fneg) across eight lanes.
256 vector.ph: ; preds = %entry
257 %fneg = fneg fast half %1
258 %broadcast.splatinsert19 = insertelement <8 x half> undef, half %1, i32 0
259 %broadcast.splat20 = shufflevector <8 x half> %broadcast.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
260 %broadcast.splatinsert21 = insertelement <8 x half> undef, half %fneg, i32 0
261 %broadcast.splat22 = shufflevector <8 x half> %broadcast.splatinsert21, <8 x half> undef, <8 x i32> zeroinitializer
262 br label %vector.body
; Loop body: %5 is the per-lane store mask, (x < T) | (x > -T).
264 vector.body: ; preds = %vector.body, %vector.ph
265 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
266 %2 = getelementptr inbounds half, ptr %data, i32 %index
267 %wide.load = load <8 x half>, ptr %2, align 2
268 %3 = fcmp fast olt <8 x half> %wide.load, %broadcast.splat20
269 %4 = fcmp fast ogt <8 x half> %wide.load, %broadcast.splat22
270 %5 = or <8 x i1> %3, %4
271 call void @llvm.masked.store.v8f16.p0(<8 x half> zeroinitializer, ptr %2, i32 2, <8 x i1> %5)
272 %index.next = add i32 %index, 8
273 %6 = icmp eq i32 %index.next, %mul
274 br i1 %6, label %for.cond.cleanup, label %vector.body
276 for.cond.cleanup: ; preds = %vector.body, %entry
; thres_rev_i32: in-place threshold over i32 data with the compare operands
; written splat-first, four lanes per iteration.
; Each vector.body iteration loads <4 x i32> at %data + %index, selects the
; lanes where (splat(%T) sgt x) or (splat(0 - %T) slt x), and masked-stores
; zero to those lanes. %index advances by 4 until it equals %mul
; (zext(%N) << 2); the loop is bypassed when %N == 0.
; The expected assembly still compares the loaded vector against the scalar
; registers: vpte.s32 ge, then vcmpt.s32 le, then the zeroing vstrwe in the
; "e" slot.
282 define arm_aapcs_vfpcc void @thres_rev_i32(ptr %data, i16 zeroext %N, i32 %T) {
283 ; CHECK-LABEL: thres_rev_i32:
284 ; CHECK: @ %bb.0: @ %entry
285 ; CHECK-NEXT: .save {r7, lr}
286 ; CHECK-NEXT: push {r7, lr}
287 ; CHECK-NEXT: cmp r1, #0
289 ; CHECK-NEXT: popeq {r7, pc}
290 ; CHECK-NEXT: .LBB5_1: @ %vector.ph
291 ; CHECK-NEXT: mvn r3, #3
292 ; CHECK-NEXT: add.w r1, r3, r1, lsl #2
293 ; CHECK-NEXT: movs r3, #1
294 ; CHECK-NEXT: vmov.i32 q0, #0x0
295 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2
296 ; CHECK-NEXT: rsbs r1, r2, #0
297 ; CHECK-NEXT: .LBB5_2: @ %vector.body
298 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
299 ; CHECK-NEXT: vldrw.u32 q1, [r0]
300 ; CHECK-NEXT: vpte.s32 ge, q1, r2
301 ; CHECK-NEXT: vcmpt.s32 le, q1, r1
302 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16
303 ; CHECK-NEXT: le lr, .LBB5_2
304 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
305 ; CHECK-NEXT: pop {r7, pc}
; %mul is the loop bound in elements: zext(%N) * 4.
307 %conv = zext i16 %N to i32
308 %mul = shl nuw nsw i32 %conv, 2
309 %cmp15 = icmp eq i16 %N, 0
310 br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
; Preheader: splat %T and its negation (%sub = 0 - %T) across four lanes.
312 vector.ph: ; preds = %entry
313 %sub = sub nsw i32 0, %T
314 %broadcast.splatinsert17 = insertelement <4 x i32> undef, i32 %T, i32 0
315 %broadcast.splat18 = shufflevector <4 x i32> %broadcast.splatinsert17, <4 x i32> undef, <4 x i32> zeroinitializer
316 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %sub, i32 0
317 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
318 br label %vector.body
; Loop body: %3 is the per-lane store mask, (T > x) | (-T < x).
320 vector.body: ; preds = %vector.body, %vector.ph
321 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
322 %0 = getelementptr inbounds i32, ptr %data, i32 %index
323 %wide.load = load <4 x i32>, ptr %0, align 4
324 %1 = icmp sgt <4 x i32> %broadcast.splat18, %wide.load
325 %2 = icmp slt <4 x i32> %broadcast.splat20, %wide.load
326 %3 = or <4 x i1> %1, %2
327 call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr %0, i32 4, <4 x i1> %3)
328 %index.next = add i32 %index, 4
329 %4 = icmp eq i32 %index.next, %mul
330 br i1 %4, label %for.cond.cleanup, label %vector.body
332 for.cond.cleanup: ; preds = %vector.body, %entry
; thresh_rev_i16: in-place threshold over i16 data with the compare operands
; written splat-first, eight lanes per iteration.
; Each vector.body iteration loads <8 x i16> at %data + %index, selects the
; lanes where (splat(%T) sgt x) or (splat(%sub) slt x), and masked-stores zero
; to those lanes. %index advances by 8 until it equals %mul (zext(%N) << 3);
; the loop is bypassed when %N == 0.
; The expected assembly compares the loaded vector against the scalar
; registers: vpte.s16 ge, then vcmpt.s16 le, then the zeroing vstrhe in the
; "e" slot.
336 define arm_aapcs_vfpcc void @thresh_rev_i16(ptr %data, i16 zeroext %N, i16 signext %T) {
337 ; CHECK-LABEL: thresh_rev_i16:
338 ; CHECK: @ %bb.0: @ %entry
339 ; CHECK-NEXT: .save {r7, lr}
340 ; CHECK-NEXT: push {r7, lr}
341 ; CHECK-NEXT: cmp r1, #0
343 ; CHECK-NEXT: popeq {r7, pc}
344 ; CHECK-NEXT: .LBB6_1: @ %vector.ph
345 ; CHECK-NEXT: mvn r3, #7
346 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3
347 ; CHECK-NEXT: movs r3, #1
348 ; CHECK-NEXT: vmov.i32 q0, #0x0
349 ; CHECK-NEXT: add.w lr, r3, r1, lsr #3
350 ; CHECK-NEXT: rsbs r1, r2, #0
351 ; CHECK-NEXT: .LBB6_2: @ %vector.body
352 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
353 ; CHECK-NEXT: vldrh.u16 q1, [r0]
354 ; CHECK-NEXT: vpte.s16 ge, q1, r2
355 ; CHECK-NEXT: vcmpt.s16 le, q1, r1
356 ; CHECK-NEXT: vstrhe.16 q0, [r0], #16
357 ; CHECK-NEXT: le lr, .LBB6_2
358 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
359 ; CHECK-NEXT: pop {r7, pc}
; %mul is the loop bound in elements: zext(%N) * 8.
361 %conv2 = zext i16 %N to i32
362 %mul = shl nuw nsw i32 %conv2, 3
363 %cmp22 = icmp eq i16 %N, 0
364 br i1 %cmp22, label %for.cond.cleanup, label %vector.ph
; Preheader: splat %T and %sub across eight lanes.
; NOTE(review): %sub is used below but its definition is not visible in this
; view; presumably 0 - %T (the expected assembly negates r2 with rsbs) --
; confirm.
366 vector.ph: ; preds = %entry
368 %broadcast.splatinsert24 = insertelement <8 x i16> undef, i16 %T, i32 0
369 %broadcast.splat25 = shufflevector <8 x i16> %broadcast.splatinsert24, <8 x i16> undef, <8 x i32> zeroinitializer
370 %broadcast.splatinsert26 = insertelement <8 x i16> undef, i16 %sub, i32 0
371 %broadcast.splat27 = shufflevector <8 x i16> %broadcast.splatinsert26, <8 x i16> undef, <8 x i32> zeroinitializer
372 br label %vector.body
; Loop body: %3 is the per-lane store mask, (splat25 > x) | (splat27 < x).
374 vector.body: ; preds = %vector.body, %vector.ph
375 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
376 %0 = getelementptr inbounds i16, ptr %data, i32 %index
377 %wide.load = load <8 x i16>, ptr %0, align 2
378 %1 = icmp sgt <8 x i16> %broadcast.splat25, %wide.load
379 %2 = icmp slt <8 x i16> %broadcast.splat27, %wide.load
380 %3 = or <8 x i1> %1, %2
381 call void @llvm.masked.store.v8i16.p0(<8 x i16> zeroinitializer, ptr %0, i32 2, <8 x i1> %3)
382 %index.next = add i32 %index, 8
383 %4 = icmp eq i32 %index.next, %mul
384 br i1 %4, label %for.cond.cleanup, label %vector.body
386 for.cond.cleanup: ; preds = %vector.body, %entry
; thresh_rev_i8: in-place threshold over i8 data with the compare operands
; written splat-first, sixteen lanes per iteration.
; Each vector.body iteration loads <16 x i8> at %data + %index, selects the
; lanes where (splat(%T) sgt x) or (splat(%sub) slt x), and masked-stores zero
; to those lanes. %index advances by 16 until it equals %mul (zext(%N) << 4);
; the loop is bypassed when %N == 0.
; The expected assembly compares the loaded vector against the scalar
; registers: vpte.s8 ge, then vcmpt.s8 le, then the zeroing vstrbe in the
; "e" slot.
390 define arm_aapcs_vfpcc void @thresh_rev_i8(ptr %data, i16 zeroext %N, i8 signext %T) {
391 ; CHECK-LABEL: thresh_rev_i8:
392 ; CHECK: @ %bb.0: @ %entry
393 ; CHECK-NEXT: .save {r7, lr}
394 ; CHECK-NEXT: push {r7, lr}
395 ; CHECK-NEXT: cmp r1, #0
397 ; CHECK-NEXT: popeq {r7, pc}
398 ; CHECK-NEXT: .LBB7_1: @ %vector.ph
399 ; CHECK-NEXT: mvn r3, #15
400 ; CHECK-NEXT: add.w r1, r3, r1, lsl #4
401 ; CHECK-NEXT: movs r3, #1
402 ; CHECK-NEXT: vmov.i32 q0, #0x0
403 ; CHECK-NEXT: add.w lr, r3, r1, lsr #4
404 ; CHECK-NEXT: rsbs r1, r2, #0
405 ; CHECK-NEXT: .LBB7_2: @ %vector.body
406 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
407 ; CHECK-NEXT: vldrb.u8 q1, [r0]
408 ; CHECK-NEXT: vpte.s8 ge, q1, r2
409 ; CHECK-NEXT: vcmpt.s8 le, q1, r1
410 ; CHECK-NEXT: vstrbe.8 q0, [r0], #16
411 ; CHECK-NEXT: le lr, .LBB7_2
412 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
413 ; CHECK-NEXT: pop {r7, pc}
; %mul is the loop bound in elements: zext(%N) * 16.
415 %conv2 = zext i16 %N to i32
416 %mul = shl nuw nsw i32 %conv2, 4
417 %cmp20 = icmp eq i16 %N, 0
418 br i1 %cmp20, label %for.cond.cleanup, label %vector.ph
; Preheader: splat %T and %sub across sixteen lanes.
; NOTE(review): %sub is used below but its definition is not visible in this
; view; presumably 0 - %T (the expected assembly negates r2 with rsbs) --
; confirm.
420 vector.ph: ; preds = %entry
422 %broadcast.splatinsert22 = insertelement <16 x i8> undef, i8 %T, i32 0
423 %broadcast.splat23 = shufflevector <16 x i8> %broadcast.splatinsert22, <16 x i8> undef, <16 x i32> zeroinitializer
424 %broadcast.splatinsert24 = insertelement <16 x i8> undef, i8 %sub, i32 0
425 %broadcast.splat25 = shufflevector <16 x i8> %broadcast.splatinsert24, <16 x i8> undef, <16 x i32> zeroinitializer
426 br label %vector.body
; Loop body: %3 is the per-lane store mask, (splat23 > x) | (splat25 < x).
428 vector.body: ; preds = %vector.body, %vector.ph
429 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
430 %0 = getelementptr inbounds i8, ptr %data, i32 %index
431 %wide.load = load <16 x i8>, ptr %0, align 1
432 %1 = icmp sgt <16 x i8> %broadcast.splat23, %wide.load
433 %2 = icmp slt <16 x i8> %broadcast.splat25, %wide.load
434 %3 = or <16 x i1> %1, %2
435 call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr %0, i32 1, <16 x i1> %3)
436 %index.next = add i32 %index, 16
437 %4 = icmp eq i32 %index.next, %mul
438 br i1 %4, label %for.cond.cleanup, label %vector.body
440 for.cond.cleanup: ; preds = %vector.body, %entry
; thresh_rev_f32: in-place threshold over float data with the compare operands
; written splat-first, four lanes per iteration.
; Each vector.body iteration loads <4 x float> at %data + %index, selects the
; lanes where (splat(%T) fast ogt x) or (splat(fneg %T) fast olt x), and
; masked-stores zero to those lanes. %index advances by 4 until it equals %mul
; (zext(%N) << 2); the loop is bypassed when %N == 0.
; The expected assembly flips the threshold's sign bit with eor #-2147483648
; and compares the loaded vector against the scalar registers: vpte.f32 ge,
; then vcmpt.f32 le, then the zeroing vstrwe in the "e" slot.
444 define arm_aapcs_vfpcc void @thresh_rev_f32(ptr %data, i16 zeroext %N, float %T) {
445 ; CHECK-LABEL: thresh_rev_f32:
446 ; CHECK: @ %bb.0: @ %entry
447 ; CHECK-NEXT: .save {r7, lr}
448 ; CHECK-NEXT: push {r7, lr}
449 ; CHECK-NEXT: cmp r1, #0
451 ; CHECK-NEXT: popeq {r7, pc}
452 ; CHECK-NEXT: .LBB8_1: @ %vector.ph
453 ; CHECK-NEXT: mvn r2, #3
454 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
455 ; CHECK-NEXT: movs r2, #1
456 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2
457 ; CHECK-NEXT: vmov r1, s0
458 ; CHECK-NEXT: vmov.i32 q0, #0x0
459 ; CHECK-NEXT: eor r2, r1, #-2147483648
460 ; CHECK-NEXT: .LBB8_2: @ %vector.body
461 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
462 ; CHECK-NEXT: vldrw.u32 q1, [r0]
463 ; CHECK-NEXT: vpte.f32 ge, q1, r1
464 ; CHECK-NEXT: vcmpt.f32 le, q1, r2
465 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16
466 ; CHECK-NEXT: le lr, .LBB8_2
467 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
468 ; CHECK-NEXT: pop {r7, pc}
; %mul is the loop bound in elements: zext(%N) * 4.
470 %conv = zext i16 %N to i32
471 %mul = shl nuw nsw i32 %conv, 2
472 %cmp15 = icmp eq i16 %N, 0
473 br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
; Preheader: splat %T and its negation (%fneg) across four lanes.
475 vector.ph: ; preds = %entry
476 %fneg = fneg fast float %T
477 %broadcast.splatinsert17 = insertelement <4 x float> undef, float %T, i32 0
478 %broadcast.splat18 = shufflevector <4 x float> %broadcast.splatinsert17, <4 x float> undef, <4 x i32> zeroinitializer
479 %broadcast.splatinsert19 = insertelement <4 x float> undef, float %fneg, i32 0
480 %broadcast.splat20 = shufflevector <4 x float> %broadcast.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
481 br label %vector.body
; Loop body: %3 is the per-lane store mask, (T > x) | (-T < x).
483 vector.body: ; preds = %vector.body, %vector.ph
484 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
485 %0 = getelementptr inbounds float, ptr %data, i32 %index
486 %wide.load = load <4 x float>, ptr %0, align 4
487 %1 = fcmp fast ogt <4 x float> %broadcast.splat18, %wide.load
488 %2 = fcmp fast olt <4 x float> %broadcast.splat20, %wide.load
489 %3 = or <4 x i1> %1, %2
490 call void @llvm.masked.store.v4f32.p0(<4 x float> zeroinitializer, ptr %0, i32 4, <4 x i1> %3)
491 %index.next = add i32 %index, 4
492 %4 = icmp eq i32 %index.next, %mul
493 br i1 %4, label %for.cond.cleanup, label %vector.body
495 for.cond.cleanup: ; preds = %vector.body, %entry
; thresh_rev_f16: in-place threshold over half data with the compare operands
; written splat-first, eight lanes per iteration.
; The threshold arrives as float %T.coerce; its low 16 bits are reinterpreted
; as a half (bitcast to i32, trunc to i16, bitcast to half).
; Each vector.body iteration loads <8 x half> at %data + %index, selects the
; lanes where (splat(T) fast ogt x) or (splat(fneg T) fast olt x), and
; masked-stores zero to those lanes. %index advances by 8 until it equals %mul
; (zext(%N) << 3); the loop is bypassed when %N == 0.
; The expected assembly negates the threshold with vneg.f16 and compares the
; loaded vector against the scalar registers: vpte.f16 ge, then vcmpt.f16 le,
; then the zeroing vstrhe in the "e" slot.
499 define arm_aapcs_vfpcc void @thresh_rev_f16(ptr %data, i16 zeroext %N, float %T.coerce) {
500 ; CHECK-LABEL: thresh_rev_f16:
501 ; CHECK: @ %bb.0: @ %entry
502 ; CHECK-NEXT: .save {r7, lr}
503 ; CHECK-NEXT: push {r7, lr}
504 ; CHECK-NEXT: cmp r1, #0
506 ; CHECK-NEXT: popeq {r7, pc}
507 ; CHECK-NEXT: .LBB9_1: @ %vector.ph
508 ; CHECK-NEXT: mvn r3, #7
509 ; CHECK-NEXT: add.w r1, r3, r1, lsl #3
510 ; CHECK-NEXT: vmov r2, s0
511 ; CHECK-NEXT: vneg.f16 s0, s0
512 ; CHECK-NEXT: movs r3, #1
513 ; CHECK-NEXT: add.w lr, r3, r1, lsr #3
514 ; CHECK-NEXT: vmov.f16 r1, s0
515 ; CHECK-NEXT: vmov.i32 q0, #0x0
516 ; CHECK-NEXT: .LBB9_2: @ %vector.body
517 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
518 ; CHECK-NEXT: vldrh.u16 q1, [r0]
519 ; CHECK-NEXT: vpte.f16 ge, q1, r2
520 ; CHECK-NEXT: vcmpt.f16 le, q1, r1
521 ; CHECK-NEXT: vstrhe.16 q0, [r0], #16
522 ; CHECK-NEXT: le lr, .LBB9_2
523 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
524 ; CHECK-NEXT: pop {r7, pc}
; Recover the half threshold (%1) from the low 16 bits of %T.coerce.
526 %0 = bitcast float %T.coerce to i32
527 %tmp.0.extract.trunc = trunc i32 %0 to i16
528 %1 = bitcast i16 %tmp.0.extract.trunc to half
; %mul is the loop bound in elements: zext(%N) * 8.
529 %conv = zext i16 %N to i32
530 %mul = shl nuw nsw i32 %conv, 3
531 %cmp17 = icmp eq i16 %N, 0
532 br i1 %cmp17, label %for.cond.cleanup, label %vector.ph
; Preheader: splat the threshold and its negation (%fneg) across eight lanes.
534 vector.ph: ; preds = %entry
535 %fneg = fneg fast half %1
536 %broadcast.splatinsert19 = insertelement <8 x half> undef, half %1, i32 0
537 %broadcast.splat20 = shufflevector <8 x half> %broadcast.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
538 %broadcast.splatinsert21 = insertelement <8 x half> undef, half %fneg, i32 0
539 %broadcast.splat22 = shufflevector <8 x half> %broadcast.splatinsert21, <8 x half> undef, <8 x i32> zeroinitializer
540 br label %vector.body
; Loop body: %5 is the per-lane store mask, (T > x) | (-T < x).
542 vector.body: ; preds = %vector.body, %vector.ph
543 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
544 %2 = getelementptr inbounds half, ptr %data, i32 %index
545 %wide.load = load <8 x half>, ptr %2, align 2
546 %3 = fcmp fast ogt <8 x half> %broadcast.splat20, %wide.load
547 %4 = fcmp fast olt <8 x half> %broadcast.splat22, %wide.load
548 %5 = or <8 x i1> %3, %4
549 call void @llvm.masked.store.v8f16.p0(<8 x half> zeroinitializer, ptr %2, i32 2, <8 x i1> %5)
550 %index.next = add i32 %index, 8
551 %6 = icmp eq i32 %index.next, %mul
552 br i1 %6, label %for.cond.cleanup, label %vector.body
554 for.cond.cleanup: ; preds = %vector.body, %entry
; Masked-store intrinsic declarations, one per vector type stored by the
; functions in this file. Operands in order: the value vector, the destination
; pointer, an immediate i32 (the alignment, per the LLVM LangRef), and the
; per-lane i1 mask.
561 declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
562 declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
563 declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
564 declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
565 declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>)