; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s
; thres_i32: walks %data four i32 lanes at a time and stores zero (via a masked
; store) into every lane whose value is < %T or > -%T. The expected assembly
; folds both compares into one VPT block (vpte + vcmpt) that predicates the
; store, with the loop closed by `le lr`.
define arm_aapcs_vfpcc void @thres_i32(i32* %data, i16 zeroext %N, i32 %T) {
; CHECK-LABEL: thres_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    mvn r3, #3
; CHECK-NEXT:    add.w r1, r3, r1, lsl #2
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
; CHECK-NEXT:    rsbs r1, r2, #0
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vpte.s32 ge, q1, r2
; CHECK-NEXT:    vcmpt.s32 le, q1, r1
; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Loop bound is N << 2; a zero N skips the loop entirely.
  %conv = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv, 2
  %cmp15 = icmp eq i16 %N, 0
  br i1 %cmp15, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all four lanes.
  %sub = sub nsw i32 0, %T
  %broadcast.splatinsert17 = insertelement <4 x i32> undef, i32 %T, i32 0
  %broadcast.splat18 = shufflevector <4 x i32> %broadcast.splatinsert17, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %sub, i32 0
  %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %data, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Mask = (x < T) | (x > -T); masked lanes are overwritten with zero.
  %2 = icmp slt <4 x i32> %wide.load, %broadcast.splat18
  %3 = icmp sgt <4 x i32> %wide.load, %broadcast.splat20
  %4 = or <4 x i1> %2, %3
  %5 = bitcast i32* %0 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> zeroinitializer, <4 x i32>* %5, i32 4, <4 x i1> %4)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %mul
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; thresh_i16: walks %data eight i16 lanes at a time and stores zero (via a
; masked store) into every lane whose value is < %T or > -%T. The expected
; assembly uses a single VPT block (vpte + vcmpt) to predicate the store.
define arm_aapcs_vfpcc void @thresh_i16(i16* %data, i16 zeroext %N, i16 signext %T) {
; CHECK-LABEL: thresh_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    mvn r3, #7
; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
; CHECK-NEXT:    rsbs r1, r2, #0
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vpte.s16 ge, q1, r2
; CHECK-NEXT:    vcmpt.s16 le, q1, r1
; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Loop bound is N << 3; a zero N skips the loop entirely.
  %conv2 = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv2, 3
  %cmp22 = icmp eq i16 %N, 0
  br i1 %cmp22, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all eight lanes (matches `rsbs r1, r2, #0` above).
  %sub = sub i16 0, %T
  %broadcast.splatinsert24 = insertelement <8 x i16> undef, i16 %T, i32 0
  %broadcast.splat25 = shufflevector <8 x i16> %broadcast.splatinsert24, <8 x i16> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert26 = insertelement <8 x i16> undef, i16 %sub, i32 0
  %broadcast.splat27 = shufflevector <8 x i16> %broadcast.splatinsert26, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %data, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  ; Mask = (x < T) | (x > -T); masked lanes are overwritten with zero.
  %2 = icmp slt <8 x i16> %wide.load, %broadcast.splat25
  %3 = icmp sgt <8 x i16> %wide.load, %broadcast.splat27
  %4 = or <8 x i1> %2, %3
  %5 = bitcast i16* %0 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> zeroinitializer, <8 x i16>* %5, i32 2, <8 x i1> %4)
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %mul
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; thresh_i8: walks %data sixteen i8 lanes at a time and stores zero (via a
; masked store) into every lane whose value is < %T or > -%T. The expected
; assembly uses a single VPT block (vpte + vcmpt) to predicate the store.
define arm_aapcs_vfpcc void @thresh_i8(i8* %data, i16 zeroext %N, i8 signext %T) {
; CHECK-LABEL: thresh_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    mvn r3, #15
; CHECK-NEXT:    add.w r1, r3, r1, lsl #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r1, lsr #4
; CHECK-NEXT:    rsbs r1, r2, #0
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vpte.s8 ge, q1, r2
; CHECK-NEXT:    vcmpt.s8 le, q1, r1
; CHECK-NEXT:    vstrbe.8 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Loop bound is N << 4; a zero N skips the loop entirely.
  %conv2 = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv2, 4
  %cmp20 = icmp eq i16 %N, 0
  br i1 %cmp20, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all sixteen lanes (matches `rsbs r1, r2, #0` above).
  %sub = sub i8 0, %T
  %broadcast.splatinsert22 = insertelement <16 x i8> undef, i8 %T, i32 0
  %broadcast.splat23 = shufflevector <16 x i8> %broadcast.splatinsert22, <16 x i8> undef, <16 x i32> zeroinitializer
  %broadcast.splatinsert24 = insertelement <16 x i8> undef, i8 %sub, i32 0
  %broadcast.splat25 = shufflevector <16 x i8> %broadcast.splatinsert24, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %data, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  ; Mask = (x < T) | (x > -T); masked lanes are overwritten with zero.
  %2 = icmp slt <16 x i8> %wide.load, %broadcast.splat23
  %3 = icmp sgt <16 x i8> %wide.load, %broadcast.splat25
  %4 = or <16 x i1> %2, %3
  %5 = bitcast i8* %0 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> zeroinitializer, <16 x i8>* %5, i32 1, <16 x i1> %4)
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, %mul
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; thresh_f32: walks %data four float lanes at a time and stores zero (via a
; masked store) into every lane whose value is olt %T or ogt -%T (fast-math
; compares). The expected assembly negates T with an integer sign-bit flip
; (eor #-2147483648) and uses a single VPT block to predicate the store.
define arm_aapcs_vfpcc void @thresh_f32(float* %data, i16 zeroext %N, float %T) {
; CHECK-LABEL: thresh_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    mvn r2, #3
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    add.w lr, r2, r1, lsr #2
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    eor r2, r1, #-2147483648
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vpte.f32 ge, q1, r1
; CHECK-NEXT:    vcmpt.f32 le, q1, r2
; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Loop bound is N << 2; a zero N skips the loop entirely.
  %conv = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv, 2
  %cmp15 = icmp eq i16 %N, 0
  br i1 %cmp15, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all four lanes.
  %fneg = fneg fast float %T
  %broadcast.splatinsert17 = insertelement <4 x float> undef, float %T, i32 0
  %broadcast.splat18 = shufflevector <4 x float> %broadcast.splatinsert17, <4 x float> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert19 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat20 = shufflevector <4 x float> %broadcast.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %data, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  ; Mask = (x olt T) | (x ogt -T); masked lanes are overwritten with zero.
  %2 = fcmp fast olt <4 x float> %wide.load, %broadcast.splat18
  %3 = fcmp fast ogt <4 x float> %wide.load, %broadcast.splat20
  %4 = or <4 x i1> %2, %3
  %5 = bitcast float* %0 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> zeroinitializer, <4 x float>* %5, i32 4, <4 x i1> %4)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %mul
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; thresh_f16: walks %data eight half lanes at a time and stores zero (via a
; masked store) into every lane whose value is olt T or ogt -T (fast-math
; compares). T arrives as %T.coerce, a float whose low 16 bits are
; reinterpreted as a half. The expected assembly uses vneg.f16 for -T and a
; single VPT block to predicate the store.
define arm_aapcs_vfpcc void @thresh_f16(half* %data, i16 zeroext %N, float %T.coerce) {
; CHECK-LABEL: thresh_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    mvn r3, #7
; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vneg.f16 s0, s0
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
; CHECK-NEXT:    vmov.f16 r1, s0
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vpte.f16 ge, q1, r2
; CHECK-NEXT:    vcmpt.f16 le, q1, r1
; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Recover the half threshold from the low 16 bits of the coerced float.
  %0 = bitcast float %T.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  ; Loop bound is N << 3; a zero N skips the loop entirely.
  %conv = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv, 3
  %cmp17 = icmp eq i16 %N, 0
  br i1 %cmp17, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all eight lanes.
  %fneg = fneg fast half %1
  %broadcast.splatinsert19 = insertelement <8 x half> undef, half %1, i32 0
  %broadcast.splat20 = shufflevector <8 x half> %broadcast.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert21 = insertelement <8 x half> undef, half %fneg, i32 0
  %broadcast.splat22 = shufflevector <8 x half> %broadcast.splatinsert21, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = getelementptr inbounds half, half* %data, i32 %index
  %3 = bitcast half* %2 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %3, align 2
  ; Mask = (x olt T) | (x ogt -T); masked lanes are overwritten with zero.
  %4 = fcmp fast olt <8 x half> %wide.load, %broadcast.splat20
  %5 = fcmp fast ogt <8 x half> %wide.load, %broadcast.splat22
  %6 = or <8 x i1> %4, %5
  %7 = bitcast half* %2 to <8 x half>*
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> zeroinitializer, <8 x half>* %7, i32 2, <8 x i1> %6)
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %mul
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; thres_rev_i32: same thresholding as a <4 x i32> masked zero-store, but with
; the compare operands reversed in the IR (the splat is the left-hand operand:
; T > x and -T < x). The expected assembly still puts the loaded vector first
; and the scalar threshold second inside a single VPT block.
define arm_aapcs_vfpcc void @thres_rev_i32(i32* %data, i16 zeroext %N, i32 %T) {
; CHECK-LABEL: thres_rev_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    mvn r3, #3
; CHECK-NEXT:    add.w r1, r3, r1, lsl #2
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
; CHECK-NEXT:    rsbs r1, r2, #0
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vpte.s32 ge, q1, r2
; CHECK-NEXT:    vcmpt.s32 le, q1, r1
; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Loop bound is N << 2; a zero N skips the loop entirely.
  %conv = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv, 2
  %cmp15 = icmp eq i16 %N, 0
  br i1 %cmp15, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all four lanes.
  %sub = sub nsw i32 0, %T
  %broadcast.splatinsert17 = insertelement <4 x i32> undef, i32 %T, i32 0
  %broadcast.splat18 = shufflevector <4 x i32> %broadcast.splatinsert17, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %sub, i32 0
  %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %data, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Mask = (T > x) | (-T < x), with the splat as the left-hand operand.
  %2 = icmp sgt <4 x i32> %broadcast.splat18, %wide.load
  %3 = icmp slt <4 x i32> %broadcast.splat20, %wide.load
  %4 = or <4 x i1> %2, %3
  %5 = bitcast i32* %0 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> zeroinitializer, <4 x i32>* %5, i32 4, <4 x i1> %4)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %mul
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; thresh_rev_i16: <8 x i16> masked zero-store thresholding with the compare
; operands reversed in the IR (the splat is the left-hand operand: T > x and
; -T < x). The expected assembly still puts the loaded vector first and the
; scalar threshold second inside a single VPT block.
define arm_aapcs_vfpcc void @thresh_rev_i16(i16* %data, i16 zeroext %N, i16 signext %T) {
; CHECK-LABEL: thresh_rev_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB6_1: @ %vector.ph
; CHECK-NEXT:    mvn r3, #7
; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
; CHECK-NEXT:    rsbs r1, r2, #0
; CHECK-NEXT:  .LBB6_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vpte.s16 ge, q1, r2
; CHECK-NEXT:    vcmpt.s16 le, q1, r1
; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Loop bound is N << 3; a zero N skips the loop entirely.
  %conv2 = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv2, 3
  %cmp22 = icmp eq i16 %N, 0
  br i1 %cmp22, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all eight lanes (matches `rsbs r1, r2, #0` above).
  %sub = sub i16 0, %T
  %broadcast.splatinsert24 = insertelement <8 x i16> undef, i16 %T, i32 0
  %broadcast.splat25 = shufflevector <8 x i16> %broadcast.splatinsert24, <8 x i16> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert26 = insertelement <8 x i16> undef, i16 %sub, i32 0
  %broadcast.splat27 = shufflevector <8 x i16> %broadcast.splatinsert26, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %data, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  ; Mask = (T > x) | (-T < x), with the splat as the left-hand operand.
  %2 = icmp sgt <8 x i16> %broadcast.splat25, %wide.load
  %3 = icmp slt <8 x i16> %broadcast.splat27, %wide.load
  %4 = or <8 x i1> %2, %3
  %5 = bitcast i16* %0 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> zeroinitializer, <8 x i16>* %5, i32 2, <8 x i1> %4)
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %mul
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; thresh_rev_i8: <16 x i8> masked zero-store thresholding with the compare
; operands reversed in the IR (the splat is the left-hand operand: T > x and
; -T < x). The expected assembly still puts the loaded vector first and the
; scalar threshold second inside a single VPT block.
define arm_aapcs_vfpcc void @thresh_rev_i8(i8* %data, i16 zeroext %N, i8 signext %T) {
; CHECK-LABEL: thresh_rev_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB7_1: @ %vector.ph
; CHECK-NEXT:    mvn r3, #15
; CHECK-NEXT:    add.w r1, r3, r1, lsl #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w lr, r3, r1, lsr #4
; CHECK-NEXT:    rsbs r1, r2, #0
; CHECK-NEXT:  .LBB7_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vpte.s8 ge, q1, r2
; CHECK-NEXT:    vcmpt.s8 le, q1, r1
; CHECK-NEXT:    vstrbe.8 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB7_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Loop bound is N << 4; a zero N skips the loop entirely.
  %conv2 = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv2, 4
  %cmp20 = icmp eq i16 %N, 0
  br i1 %cmp20, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all sixteen lanes (matches `rsbs r1, r2, #0` above).
  %sub = sub i8 0, %T
  %broadcast.splatinsert22 = insertelement <16 x i8> undef, i8 %T, i32 0
  %broadcast.splat23 = shufflevector <16 x i8> %broadcast.splatinsert22, <16 x i8> undef, <16 x i32> zeroinitializer
  %broadcast.splatinsert24 = insertelement <16 x i8> undef, i8 %sub, i32 0
  %broadcast.splat25 = shufflevector <16 x i8> %broadcast.splatinsert24, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %data, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  ; Mask = (T > x) | (-T < x), with the splat as the left-hand operand.
  %2 = icmp sgt <16 x i8> %broadcast.splat23, %wide.load
  %3 = icmp slt <16 x i8> %broadcast.splat25, %wide.load
  %4 = or <16 x i1> %2, %3
  %5 = bitcast i8* %0 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> zeroinitializer, <16 x i8>* %5, i32 1, <16 x i1> %4)
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, %mul
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; thresh_rev_f32: <4 x float> masked zero-store thresholding with the compare
; operands reversed in the IR (the splat is the left-hand operand: T ogt x and
; -T olt x, fast-math). The expected assembly still puts the loaded vector
; first and the scalar threshold second inside a single VPT block.
define arm_aapcs_vfpcc void @thresh_rev_f32(float* %data, i16 zeroext %N, float %T) {
; CHECK-LABEL: thresh_rev_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB8_1: @ %vector.ph
; CHECK-NEXT:    mvn r2, #3
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    add.w lr, r2, r1, lsr #2
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    eor r2, r1, #-2147483648
; CHECK-NEXT:  .LBB8_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vpte.f32 ge, q1, r1
; CHECK-NEXT:    vcmpt.f32 le, q1, r2
; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB8_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Loop bound is N << 2; a zero N skips the loop entirely.
  %conv = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv, 2
  %cmp15 = icmp eq i16 %N, 0
  br i1 %cmp15, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all four lanes.
  %fneg = fneg fast float %T
  %broadcast.splatinsert17 = insertelement <4 x float> undef, float %T, i32 0
  %broadcast.splat18 = shufflevector <4 x float> %broadcast.splatinsert17, <4 x float> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert19 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat20 = shufflevector <4 x float> %broadcast.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %data, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  ; Mask = (T ogt x) | (-T olt x), with the splat as the left-hand operand.
  %2 = fcmp fast ogt <4 x float> %broadcast.splat18, %wide.load
  %3 = fcmp fast olt <4 x float> %broadcast.splat20, %wide.load
  %4 = or <4 x i1> %2, %3
  %5 = bitcast float* %0 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> zeroinitializer, <4 x float>* %5, i32 4, <4 x i1> %4)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %mul
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; thresh_rev_f16: <8 x half> masked zero-store thresholding with the compare
; operands reversed in the IR (the splat is the left-hand operand: T ogt x and
; -T olt x, fast-math). T arrives as %T.coerce, a float whose low 16 bits are
; reinterpreted as a half. The expected assembly still puts the loaded vector
; first and the scalar threshold second inside a single VPT block.
define arm_aapcs_vfpcc void @thresh_rev_f16(half* %data, i16 zeroext %N, float %T.coerce) {
; CHECK-LABEL: thresh_rev_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB9_1: @ %vector.ph
; CHECK-NEXT:    mvn r3, #7
; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vneg.f16 s0, s0
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
; CHECK-NEXT:    vmov.f16 r1, s0
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB9_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vpte.f16 ge, q1, r2
; CHECK-NEXT:    vcmpt.f16 le, q1, r1
; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB9_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Recover the half threshold from the low 16 bits of the coerced float.
  %0 = bitcast float %T.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  ; Loop bound is N << 3; a zero N skips the loop entirely.
  %conv = zext i16 %N to i32
  %mul = shl nuw nsw i32 %conv, 3
  %cmp17 = icmp eq i16 %N, 0
  br i1 %cmp17, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat T and -T across all eight lanes.
  %fneg = fneg fast half %1
  %broadcast.splatinsert19 = insertelement <8 x half> undef, half %1, i32 0
  %broadcast.splat20 = shufflevector <8 x half> %broadcast.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert21 = insertelement <8 x half> undef, half %fneg, i32 0
  %broadcast.splat22 = shufflevector <8 x half> %broadcast.splatinsert21, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = getelementptr inbounds half, half* %data, i32 %index
  %3 = bitcast half* %2 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %3, align 2
  ; Mask = (T ogt x) | (-T olt x), with the splat as the left-hand operand.
  %4 = fcmp fast ogt <8 x half> %broadcast.splat20, %wide.load
  %5 = fcmp fast olt <8 x half> %broadcast.splat22, %wide.load
  %6 = or <8 x i1> %4, %5
  %7 = bitcast half* %2 to <8 x half>*
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> zeroinitializer, <8 x half>* %7, i32 2, <8 x i1> %6)
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %mul
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Masked-store intrinsics called by the test functions in this file.
; Operands: value to store, destination pointer, alignment (immediate), and
; the per-lane i1 mask.
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>)