; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16 -enable-arm-maskedldst=true -disable-mve-tail-predication=false %s -o - | FileCheck %s
; f32 multiply loop (a[i] = b[i] * c[i]): with no aliasing it becomes a
; tail-predicated MVE loop (vctp/dls/le); on a runtime alias conflict it
; falls back to a 4x-unrolled scalar loop plus a wls-driven epilogue.
define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocapture readonly %b, float* nocapture readonly %c, i32 %N) {
; CHECK-LABEL: fast_float_mul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    beq.w .LBB0_11
; CHECK-NEXT:  @ %bb.1: @ %vector.memcheck
; CHECK-NEXT:    add.w r4, r0, r3, lsl #2
; CHECK-NEXT:    add.w r5, r2, r3, lsl #2
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    mov.w r12, #1
; CHECK-NEXT:    cset lr, hi
; CHECK-NEXT:    cmp r5, r0
; CHECK-NEXT:    cset r6, hi
; CHECK-NEXT:    cmp r4, r1
; CHECK-NEXT:    add.w r5, r1, r3, lsl #2
; CHECK-NEXT:    cset r4, hi
; CHECK-NEXT:    cmp r5, r0
; CHECK-NEXT:    cset r5, hi
; CHECK-NEXT:    ands r5, r4
; CHECK-NEXT:    lsls r5, r5, #31
; CHECK-NEXT:    itt eq
; CHECK-NEXT:    andeq.w r6, r6, lr
; CHECK-NEXT:    lslseq.w r6, r6, #31
; CHECK-NEXT:    beq .LBB0_4
; CHECK-NEXT:  @ %bb.2: @ %for.body.preheader
; CHECK-NEXT:    subs r6, r3, #1
; CHECK-NEXT:    and r7, r3, #3
; CHECK-NEXT:    cmp r6, #3
; CHECK-NEXT:    bhs .LBB0_6
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:    mov r8, r7
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    b .LBB0_8
; CHECK-NEXT:  .LBB0_4: @ %vector.ph
; CHECK-NEXT:    adds r6, r3, #3
; CHECK-NEXT:    bic r6, r6, #3
; CHECK-NEXT:    subs r6, #4
; CHECK-NEXT:    add.w lr, r12, r6, lsr #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB0_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r3
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q0, [r1]
; CHECK-NEXT:    vldrwt.u32 q1, [r2]
; CHECK-NEXT:    vmul.f32 q0, q1, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [r0]
; CHECK-NEXT:    adds r1, #16
; CHECK-NEXT:    adds r2, #16
; CHECK-NEXT:    adds r0, #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    le lr, .LBB0_5
; CHECK-NEXT:    b .LBB0_11
; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader.new
; CHECK-NEXT:    subs r3, r3, r7
; CHECK-NEXT:    mov r8, r7
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    add.w lr, r12, r3, lsr #2
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB0_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    adds r4, r1, r3
; CHECK-NEXT:    adds r5, r2, r3
; CHECK-NEXT:    adds r6, r0, r3
; CHECK-NEXT:    adds r3, #16
; CHECK-NEXT:    vldr s0, [r4]
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    vldr s2, [r5]
; CHECK-NEXT:    vmul.f32 s0, s2, s0
; CHECK-NEXT:    vstr s0, [r6]
; CHECK-NEXT:    vldr s0, [r4, #4]
; CHECK-NEXT:    vldr s2, [r5, #4]
; CHECK-NEXT:    vmul.f32 s0, s2, s0
; CHECK-NEXT:    vstr s0, [r6, #4]
; CHECK-NEXT:    vldr s0, [r4, #8]
; CHECK-NEXT:    vldr s2, [r5, #8]
; CHECK-NEXT:    vmul.f32 s0, s2, s0
; CHECK-NEXT:    vstr s0, [r6, #8]
; CHECK-NEXT:    vldr s0, [r4, #12]
; CHECK-NEXT:    vldr s2, [r5, #12]
; CHECK-NEXT:    vmul.f32 s0, s2, s0
; CHECK-NEXT:    vstr s0, [r6, #12]
; CHECK-NEXT:    le lr, .LBB0_7
; CHECK-NEXT:  .LBB0_8: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT:    wls lr, r8, .LBB0_11
; CHECK-NEXT:  @ %bb.9: @ %for.body.epil.preheader
; CHECK-NEXT:    mvn r3, #3
; CHECK-NEXT:    mov lr, r8
; CHECK-NEXT:    add.w r3, r3, r12, lsl #2
; CHECK-NEXT:    add r1, r3
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    add r0, r3
; CHECK-NEXT:  .LBB0_10: @ %for.body.epil
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr s0, [r1, #4]
; CHECK-NEXT:    adds r1, #4
; CHECK-NEXT:    vldr s2, [r2, #4]
; CHECK-NEXT:    adds r2, #4
; CHECK-NEXT:    vmul.f32 s0, s2, s0
; CHECK-NEXT:    vstr s0, [r0, #4]
; CHECK-NEXT:    adds r0, #4
; CHECK-NEXT:    le lr, .LBB0_10
; CHECK-NEXT:  .LBB0_11: @ %for.cond.cleanup
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck

vector.memcheck:                                  ; preds = %entry
  %scevgep = getelementptr float, float* %a, i32 %N
  %scevgep13 = getelementptr float, float* %b, i32 %N
  %scevgep16 = getelementptr float, float* %c, i32 %N
  %bound0 = icmp ugt float* %scevgep13, %a
  %bound1 = icmp ugt float* %scevgep, %b
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt float* %scevgep16, %a
  %bound119 = icmp ugt float* %scevgep, %c
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph

for.body.preheader:                               ; preds = %vector.memcheck
  %0 = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %1 = icmp ult i32 %0, 3
  br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new:                           ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

vector.ph:                                        ; preds = %vector.memcheck
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %2 = getelementptr inbounds float, float* %b, i32 %index
  %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
  %4 = bitcast float* %2 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef)
  %5 = getelementptr inbounds float, float* %c, i32 %index
  %6 = bitcast float* %5 to <4 x float>*
  %wide.masked.load23 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %6, i32 4, <4 x i1> %3, <4 x float> undef)
  %7 = fmul fast <4 x float> %wide.masked.load23, %wide.masked.load
  %8 = getelementptr inbounds float, float* %a, i32 %index
  %9 = bitcast float* %8 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %7, <4 x float>* %9, i32 4, <4 x i1> %3)
  %index.next = add i32 %index, 4
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
  %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds float, float* %b, i32 %i.09.epil
  %11 = load float, float* %arrayidx.epil, align 4
  %arrayidx1.epil = getelementptr inbounds float, float* %c, i32 %i.09.epil
  %12 = load float, float* %arrayidx1.epil, align 4
  %mul.epil = fmul fast float %12, %11
  %arrayidx2.epil = getelementptr inbounds float, float* %a, i32 %i.09.epil
  store float %mul.epil, float* %arrayidx2.epil, align 4
  %inc.epil = add nuw i32 %i.09.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup:                                 ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
  ret void

for.body:                                         ; preds = %for.body, %for.body.preheader.new
  %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %b, i32 %i.09
  %13 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %c, i32 %i.09
  %14 = load float, float* %arrayidx1, align 4
  %mul = fmul fast float %14, %13
  %arrayidx2 = getelementptr inbounds float, float* %a, i32 %i.09
  store float %mul, float* %arrayidx2, align 4
  %inc = or i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, float* %b, i32 %inc
  %15 = load float, float* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, float* %c, i32 %inc
  %16 = load float, float* %arrayidx1.1, align 4
  %mul.1 = fmul fast float %16, %15
  %arrayidx2.1 = getelementptr inbounds float, float* %a, i32 %inc
  store float %mul.1, float* %arrayidx2.1, align 4
  %inc.1 = or i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, float* %b, i32 %inc.1
  %17 = load float, float* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, float* %c, i32 %inc.1
  %18 = load float, float* %arrayidx1.2, align 4
  %mul.2 = fmul fast float %18, %17
  %arrayidx2.2 = getelementptr inbounds float, float* %a, i32 %inc.1
  store float %mul.2, float* %arrayidx2.2, align 4
  %inc.2 = or i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, float* %b, i32 %inc.2
  %19 = load float, float* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, float* %c, i32 %inc.2
  %20 = load float, float* %arrayidx1.3, align 4
  %mul.3 = fmul fast float %20, %19
  %arrayidx2.3 = getelementptr inbounds float, float* %a, i32 %inc.2
  store float %mul.3, float* %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}
; f32 multiply-accumulate reduction (sum += b[i] * c[i]): tail-folded MVE
; loop with a vpsel/shuffle horizontal-add reduction in the middle block.
define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float* nocapture readonly %c, i32 %N) {
; CHECK-LABEL: fast_float_mac:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB1_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vctp.32 r2
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r0]
; CHECK-NEXT:    vldrwt.u32 q2, [r1]
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    vmul.f32 q1, q2, q1
; CHECK-NEXT:    adds r0, #16
; CHECK-NEXT:    adds r1, #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vadd.f32 q1, q1, q0
; CHECK-NEXT:    le lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vctp.32 r3
; CHECK-NEXT:    vpsel q0, q1, q0
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vmov.32 r0, q0[1]
; CHECK-NEXT:    vdup.32 q1, r0
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB1_4:
; CHECK-NEXT:    vldr s0, .LCPI1_0
; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.5:
; CHECK-NEXT:  .LCPI1_0:
; CHECK-NEXT:    .long 0 @ float 0
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %b, i32 %index
  %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %c, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load13 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load13, %wide.masked.load
  %6 = fadd fast <4 x float> %5, %vec.phi
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %8 = select <4 x i1> %1, <4 x float> %6, <4 x float> %vec.phi
  %rdx.shuf = shufflevector <4 x float> %8, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd fast <4 x float> %8, %rdx.shuf
  %rdx.shuf14 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx15 = fadd fast <4 x float> %bin.rdx, %rdx.shuf14
  %9 = extractelement <4 x float> %bin.rdx15, i32 0
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %a.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %9, %middle.block ]
  ret float %a.0.lcssa
}
; f16 multiply-accumulate with fpext to f32: the <4 x half> masked loads are
; not tail-predicated, so the lanes are emulated with vmrs/ubfx mask tests
; and per-lane conditional vldr.16 loads; reduction as in fast_float_mac.
define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, half* nocapture readonly %c, i32 %N) {
; CHECK-LABEL: fast_float_half_mac:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    beq.w .LBB2_22
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    subs r2, #1
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    vmov.i32 q3, #0x0
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    vdup.32 q0, r2
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    adr r3, .LCPI2_1
; CHECK-NEXT:    vldrw.u32 q1, [r3]
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    b .LBB2_4
; CHECK-NEXT:  .LBB2_2: @ %cond.load24
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmovx.f16 s16, s20
; CHECK-NEXT:    vmov r3, s20
; CHECK-NEXT:    vmov r2, s16
; CHECK-NEXT:    vmov.16 q4[0], r3
; CHECK-NEXT:    vmov.16 q4[1], r2
; CHECK-NEXT:    vmov r2, s21
; CHECK-NEXT:    vldr.16 s20, [r1, #6]
; CHECK-NEXT:    vmov.16 q4[2], r2
; CHECK-NEXT:    vmov r2, s20
; CHECK-NEXT:    vmov.16 q4[3], r2
; CHECK-NEXT:  .LBB2_3: @ %else25
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmul.f16 q3, q4, q3
; CHECK-NEXT:    sub.w lr, lr, #1
; CHECK-NEXT:    vmovx.f16 s18, s13
; CHECK-NEXT:    vmovx.f16 s16, s12
; CHECK-NEXT:    vcvtb.f32.f16 s23, s18
; CHECK-NEXT:    adds r0, #8
; CHECK-NEXT:    vcvtb.f32.f16 s22, s13
; CHECK-NEXT:    adds r1, #8
; CHECK-NEXT:    vcvtb.f32.f16 s21, s16
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    vcvtb.f32.f16 s20, s12
; CHECK-NEXT:    vadd.f32 q3, q2, q5
; CHECK-NEXT:    cmp.w lr, #0
; CHECK-NEXT:    bne .LBB2_4
; CHECK-NEXT:    b .LBB2_21
; CHECK-NEXT:  .LBB2_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q2, q3
; CHECK-NEXT:    vadd.i32 q3, q1, r12
; CHECK-NEXT:    vcmp.u32 cs, q0, q3
; CHECK-NEXT:    @ implicit-def: $q4
; CHECK-NEXT:    vmrs r2, p0
; CHECK-NEXT:    and r3, r2, #1
; CHECK-NEXT:    rsbs r4, r3, #0
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    bfi r3, r4, #0, #1
; CHECK-NEXT:    ubfx r4, r2, #4, #1
; CHECK-NEXT:    rsbs r4, r4, #0
; CHECK-NEXT:    bfi r3, r4, #1, #1
; CHECK-NEXT:    ubfx r4, r2, #8, #1
; CHECK-NEXT:    ubfx r2, r2, #12, #1
; CHECK-NEXT:    rsbs r4, r4, #0
; CHECK-NEXT:    bfi r3, r4, #2, #1
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    bfi r3, r2, #3, #1
; CHECK-NEXT:    lsls r2, r3, #31
; CHECK-NEXT:    bne .LBB2_9
; CHECK-NEXT:  @ %bb.5: @ %else
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    lsls r2, r3, #30
; CHECK-NEXT:    bpl .LBB2_10
; CHECK-NEXT:  .LBB2_6: @ %cond.load5
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vldr.16 s12, [r0, #2]
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    vmovx.f16 s16, s17
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov.16 q3[0], r4
; CHECK-NEXT:    vmov.16 q3[1], r2
; CHECK-NEXT:    vmov r2, s17
; CHECK-NEXT:    vmov.16 q3[2], r2
; CHECK-NEXT:    vmov r2, s16
; CHECK-NEXT:    vmov.16 q3[3], r2
; CHECK-NEXT:    lsls r2, r3, #29
; CHECK-NEXT:    bmi .LBB2_11
; CHECK-NEXT:  .LBB2_7: @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmov q4, q3
; CHECK-NEXT:    lsls r2, r3, #28
; CHECK-NEXT:    bmi .LBB2_12
; CHECK-NEXT:  .LBB2_8: @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmov q3, q4
; CHECK-NEXT:    b .LBB2_13
; CHECK-NEXT:  .LBB2_9: @ %cond.load
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vldr.16 s16, [r0]
; CHECK-NEXT:    lsls r2, r3, #30
; CHECK-NEXT:    bmi .LBB2_6
; CHECK-NEXT:  .LBB2_10: @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmov q3, q4
; CHECK-NEXT:    lsls r2, r3, #29
; CHECK-NEXT:    bpl .LBB2_7
; CHECK-NEXT:  .LBB2_11: @ %cond.load8
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmovx.f16 s16, s12
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vldr.16 s20, [r0, #4]
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    vmov.16 q4[0], r2
; CHECK-NEXT:    vmovx.f16 s12, s13
; CHECK-NEXT:    vmov.16 q4[1], r4
; CHECK-NEXT:    vmov r2, s20
; CHECK-NEXT:    vmov.16 q4[2], r2
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov.16 q4[3], r2
; CHECK-NEXT:    lsls r2, r3, #28
; CHECK-NEXT:    bpl .LBB2_8
; CHECK-NEXT:  .LBB2_12: @ %cond.load11
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmovx.f16 s12, s16
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov.16 q3[0], r3
; CHECK-NEXT:    vmov.16 q3[1], r2
; CHECK-NEXT:    vmov r2, s17
; CHECK-NEXT:    vldr.16 s16, [r0, #6]
; CHECK-NEXT:    vmov.16 q3[2], r2
; CHECK-NEXT:    vmov r2, s16
; CHECK-NEXT:    vmov.16 q3[3], r2
; CHECK-NEXT:  .LBB2_13: @ %else12
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmrs r2, p0
; CHECK-NEXT:    @ implicit-def: $q5
; CHECK-NEXT:    and r3, r2, #1
; CHECK-NEXT:    rsbs r4, r3, #0
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    bfi r3, r4, #0, #1
; CHECK-NEXT:    ubfx r4, r2, #4, #1
; CHECK-NEXT:    rsbs r4, r4, #0
; CHECK-NEXT:    bfi r3, r4, #1, #1
; CHECK-NEXT:    ubfx r4, r2, #8, #1
; CHECK-NEXT:    ubfx r2, r2, #12, #1
; CHECK-NEXT:    rsbs r4, r4, #0
; CHECK-NEXT:    bfi r3, r4, #2, #1
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    bfi r3, r2, #3, #1
; CHECK-NEXT:    lsls r2, r3, #31
; CHECK-NEXT:    bne .LBB2_17
; CHECK-NEXT:  @ %bb.14: @ %else16
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    lsls r2, r3, #30
; CHECK-NEXT:    bpl .LBB2_18
; CHECK-NEXT:  .LBB2_15: @ %cond.load18
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vldr.16 s16, [r1, #2]
; CHECK-NEXT:    vmov r4, s20
; CHECK-NEXT:    vmovx.f16 s20, s21
; CHECK-NEXT:    vmov r2, s16
; CHECK-NEXT:    vmov.16 q4[0], r4
; CHECK-NEXT:    vmov.16 q4[1], r2
; CHECK-NEXT:    vmov r2, s21
; CHECK-NEXT:    vmov.16 q4[2], r2
; CHECK-NEXT:    vmov r2, s20
; CHECK-NEXT:    vmov.16 q4[3], r2
; CHECK-NEXT:    lsls r2, r3, #29
; CHECK-NEXT:    bmi .LBB2_19
; CHECK-NEXT:  .LBB2_16: @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmov q5, q4
; CHECK-NEXT:    lsls r2, r3, #28
; CHECK-NEXT:    bmi.w .LBB2_2
; CHECK-NEXT:    b .LBB2_20
; CHECK-NEXT:  .LBB2_17: @ %cond.load15
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vldr.16 s20, [r1]
; CHECK-NEXT:    lsls r2, r3, #30
; CHECK-NEXT:    bmi .LBB2_15
; CHECK-NEXT:  .LBB2_18: @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmov q4, q5
; CHECK-NEXT:    lsls r2, r3, #29
; CHECK-NEXT:    bpl .LBB2_16
; CHECK-NEXT:  .LBB2_19: @ %cond.load21
; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmovx.f16 s20, s16
; CHECK-NEXT:    vmov r2, s16
; CHECK-NEXT:    vldr.16 s24, [r1, #4]
; CHECK-NEXT:    vmov r4, s20
; CHECK-NEXT:    vmov.16 q5[0], r2
; CHECK-NEXT:    vmovx.f16 s16, s17
; CHECK-NEXT:    vmov.16 q5[1], r4
; CHECK-NEXT:    vmov r2, s24
; CHECK-NEXT:    vmov.16 q5[2], r2
; CHECK-NEXT:    vmov r2, s16
; CHECK-NEXT:    vmov.16 q5[3], r2
; CHECK-NEXT:    lsls r2, r3, #28
; CHECK-NEXT:    bmi.w .LBB2_2
; CHECK-NEXT:  .LBB2_20: @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT:    vmov q4, q5
; CHECK-NEXT:    b .LBB2_3
; CHECK-NEXT:  .LBB2_21: @ %middle.block
; CHECK-NEXT:    vpsel q0, q3, q2
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vmov.32 r0, q0[1]
; CHECK-NEXT:    vdup.32 q1, r0
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    b .LBB2_23
; CHECK-NEXT:  .LBB2_22:
; CHECK-NEXT:    vldr s0, .LCPI2_0
; CHECK-NEXT:  .LBB2_23: @ %for.cond.cleanup
; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
; CHECK-NEXT:    pop {r4, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.24:
; CHECK-NEXT:  .LCPI2_1:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 3 @ 0x3
; CHECK-NEXT:  .LCPI2_0:
; CHECK-NEXT:    .long 0 @ float 0
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %7, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds half, half* %b, i32 %index
  %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
  %2 = bitcast half* %0 to <4 x half>*
  %wide.masked.load = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %2, i32 2, <4 x i1> %1, <4 x half> undef)
  %3 = getelementptr inbounds half, half* %c, i32 %index
  %4 = bitcast half* %3 to <4 x half>*
  %wide.masked.load13 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %4, i32 2, <4 x i1> %1, <4 x half> undef)
  %5 = fmul fast <4 x half> %wide.masked.load13, %wide.masked.load
  %6 = fpext <4 x half> %5 to <4 x float>
  %7 = fadd fast <4 x float> %vec.phi, %6
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %9 = select <4 x i1> %1, <4 x float> %7, <4 x float> %vec.phi
  %rdx.shuf = shufflevector <4 x float> %9, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd fast <4 x float> %9, %rdx.shuf
  %rdx.shuf14 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx15 = fadd fast <4 x float> %bin.rdx, %rdx.shuf14
  %10 = extractelement <4 x float> %bin.rdx15, i32 0
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %a.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %10, %middle.block ]
  ret float %a.0.lcssa
}
; Masked load/store intrinsics used by the loops above; declared here so
; llc can lower them to (predicated) MVE memory operations.

; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)

; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32 immarg, <4 x i1>, <4 x half>)