1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16 %s -o - | FileCheck %s
; float_float_mul: element-wise c[i] = a[i] * b[i] for i in [0, N).
;
; The IR is given in its already vectorized and unrolled form:
;   * vector.memcheck  - runtime overlap test of c against a and against b;
;                        any overlap falls back to the scalar path.
;   * vector.body      - <4 x float> per iteration over the first (N & -4)
;                        elements.
;   * for.body.prol    - (N & 3) single-element iterations (unroll prologue).
;   * for.body         - scalar loop unrolled by 4.
; The autogenerated assertions pin the Thumb2/MVE code llc emits for it,
; including the low-overhead loop instructions (wls/dls/le).
define arm_aapcs_vfpcc void @float_float_mul(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB0_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB0_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_3: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r2, r3, lsl #2
; CHECK-NEXT: add.w r6, r1, r3, lsl #2
; CHECK-NEXT: cmp r5, r1
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: ands r5, r4
; CHECK-NEXT: lsls r5, r5, #31
; CHECK-NEXT: itt eq
; CHECK-NEXT: andeq r7, r6
; CHECK-NEXT: lslseq.w r7, r7, #31
; CHECK-NEXT: beq .LBB0_11
; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r5, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB0_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: mvn r4, #3
; CHECK-NEXT: add.w r7, r4, r12, lsl #2
; CHECK-NEXT: adds r4, r0, r7
; CHECK-NEXT: adds r6, r1, r7
; CHECK-NEXT: add r7, r2
; CHECK-NEXT: .LBB0_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s0, [r6, #4]
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vldr s2, [r4, #4]
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r7, #4]
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: le lr, .LBB0_6
; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r5, #3
; CHECK-NEXT: blo .LBB0_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r3, r12
; CHECK-NEXT: sub.w r8, r0, #8
; CHECK-NEXT: sub.w r10, r1, #8
; CHECK-NEXT: sub.w r5, r2, #8
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB0_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r4, r0, r12
; CHECK-NEXT: add.w r6, r1, r12
; CHECK-NEXT: add.w r9, r2, r12
; CHECK-NEXT: add.w r7, r8, r12
; CHECK-NEXT: vldr s0, [r6, #4]
; CHECK-NEXT: add.w r3, r10, r12
; CHECK-NEXT: vldr s2, [r4, #4]
; CHECK-NEXT: add.w r11, r5, r12
; CHECK-NEXT: add.w r8, r8, #16
; CHECK-NEXT: add.w r10, r10, #16
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs.w lr, lr, #4
; CHECK-NEXT: vstr s0, [r9, #4]
; CHECK-NEXT: vldr s0, [r3, #12]
; CHECK-NEXT: vldr s2, [r7, #12]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r11, #12]
; CHECK-NEXT: vldr s0, [r3, #16]
; CHECK-NEXT: vldr s2, [r7, #16]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r11, #16]
; CHECK-NEXT: vldr s0, [r6, #16]
; CHECK-NEXT: vldr s2, [r4, #16]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r9, #16]
; CHECK-NEXT: bne .LBB0_9
; CHECK-NEXT: .LBB0_10: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB0_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: sub.w r4, r0, #16
; CHECK-NEXT: sub.w r5, r1, #16
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: sub.w r6, r2, #16
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5, #16]!
; CHECK-NEXT: vldrw.u32 q1, [r4, #16]!
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6, #16]!
; CHECK-NEXT: le lr, .LBB0_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne.w .LBB0_4
; CHECK-NEXT: b .LBB0_10
entry:
  ; Nothing to do for an empty range.
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector path.
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

for.body.preheader22:                             ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  ; %i.09.ph is the first index the scalar code still has to process.
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %0 = xor i32 %i.09.ph, -1
  ; %1 = ~%i.09.ph + N = N - 1 - %i.09.ph (remaining elements minus one).
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol:                                    ; preds = %for.body.preheader22, %for.body.prol
  ; Unroll prologue: one element per iteration, %xtraiter (N & 3) iterations.
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
  %2 = load float, float* %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, float* %b, i32 %i.09.prol
  %3 = load float, float* %arrayidx1.prol, align 4
  %mul.prol = fmul float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
  store float %mul.prol, float* %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  ; Skip the unrolled loop when fewer than 4 elements were left on entry
  ; to %for.body.preheader22.
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck:                                  ; preds = %for.body.preheader
  ; One-past-the-end pointers of the three arrays.
  %scevgep = getelementptr float, float* %c, i32 %N
  %scevgep13 = getelementptr float, float* %a, i32 %N
  %scevgep16 = getelementptr float, float* %b, i32 %N
  ; c overlaps a when (a + N > c) and (c + N > a).
  %bound0 = icmp ugt float* %scevgep13, %c
  %bound1 = icmp ugt float* %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  ; c overlaps b when (b + N > c) and (c + N > b).
  %bound018 = icmp ugt float* %scevgep16, %c
  %bound119 = icmp ugt float* %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; N rounded down to a multiple of 4.
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, float* %a, i32 %index
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %6, align 4
  %7 = getelementptr inbounds float, float* %b, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load21 = load <4 x float>, <4 x float>* %8, align 4
  %9 = fmul <4 x float> %wide.load, %wide.load21
  %10 = getelementptr inbounds float, float* %c, i32 %index
  %11 = bitcast float* %10 to <4 x float>*
  store <4 x float> %9, <4 x float>* %11, align 4
  %index.next = add i32 %index, 4
  %12 = icmp eq i32 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Done if N was a multiple of 4; otherwise finish the tail in scalar code.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup:                                 ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.prol.loopexit, %for.body
  ; Scalar loop unrolled by 4: elements %i.09 .. %i.09 + 3.
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
  %13 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.09
  %14 = load float, float* %arrayidx1, align 4
  %mul = fmul float %13, %14
  %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
  %15 = load float, float* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, float* %b, i32 %inc
  %16 = load float, float* %arrayidx1.1, align 4
  %mul.1 = fmul float %15, %16
  %arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
  store float %mul.1, float* %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
  %17 = load float, float* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, float* %b, i32 %inc.1
  %18 = load float, float* %arrayidx1.2, align 4
  %mul.2 = fmul float %17, %18
  %arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
  store float %mul.2, float* %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
  %19 = load float, float* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, float* %b, i32 %inc.2
  %20 = load float, float* %arrayidx1.3, align 4
  %mul.3 = fmul float %19, %20
  %arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
  store float %mul.3, float* %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}

; float_float_add: element-wise c[i] = a[i] + b[i] for i in [0, N).
;
; The IR is given in its already vectorized and unrolled form:
;   * vector.memcheck  - runtime overlap test of c against a and against b;
;                        any overlap falls back to the scalar path.
;   * vector.body      - <4 x float> per iteration over the first (N & -4)
;                        elements.
;   * for.body.prol    - (N & 3) single-element iterations (unroll prologue).
;   * for.body         - scalar loop unrolled by 4.
; The autogenerated assertions pin the Thumb2/MVE code llc emits for it,
; including the low-overhead loop instructions (wls/dls/le).
define arm_aapcs_vfpcc void @float_float_add(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB1_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_3: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r2, r3, lsl #2
; CHECK-NEXT: add.w r6, r1, r3, lsl #2
; CHECK-NEXT: cmp r5, r1
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: ands r5, r4
; CHECK-NEXT: lsls r5, r5, #31
; CHECK-NEXT: itt eq
; CHECK-NEXT: andeq r7, r6
; CHECK-NEXT: lslseq.w r7, r7, #31
; CHECK-NEXT: beq .LBB1_11
; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r5, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB1_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: mvn r4, #3
; CHECK-NEXT: add.w r7, r4, r12, lsl #2
; CHECK-NEXT: adds r4, r0, r7
; CHECK-NEXT: adds r6, r1, r7
; CHECK-NEXT: add r7, r2
; CHECK-NEXT: .LBB1_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s0, [r6, #4]
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vldr s2, [r4, #4]
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r7, #4]
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: le lr, .LBB1_6
; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r5, #3
; CHECK-NEXT: blo .LBB1_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r3, r12
; CHECK-NEXT: sub.w r8, r0, #8
; CHECK-NEXT: sub.w r10, r1, #8
; CHECK-NEXT: sub.w r5, r2, #8
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB1_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r4, r0, r12
; CHECK-NEXT: add.w r6, r1, r12
; CHECK-NEXT: add.w r9, r2, r12
; CHECK-NEXT: add.w r7, r8, r12
; CHECK-NEXT: vldr s0, [r6, #4]
; CHECK-NEXT: add.w r3, r10, r12
; CHECK-NEXT: vldr s2, [r4, #4]
; CHECK-NEXT: add.w r11, r5, r12
; CHECK-NEXT: add.w r8, r8, #16
; CHECK-NEXT: add.w r10, r10, #16
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs.w lr, lr, #4
; CHECK-NEXT: vstr s0, [r9, #4]
; CHECK-NEXT: vldr s0, [r3, #12]
; CHECK-NEXT: vldr s2, [r7, #12]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r11, #12]
; CHECK-NEXT: vldr s0, [r3, #16]
; CHECK-NEXT: vldr s2, [r7, #16]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r11, #16]
; CHECK-NEXT: vldr s0, [r6, #16]
; CHECK-NEXT: vldr s2, [r4, #16]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r9, #16]
; CHECK-NEXT: bne .LBB1_9
; CHECK-NEXT: .LBB1_10: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB1_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: sub.w r4, r0, #16
; CHECK-NEXT: sub.w r5, r1, #16
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: sub.w r6, r2, #16
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5, #16]!
; CHECK-NEXT: vldrw.u32 q1, [r4, #16]!
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6, #16]!
; CHECK-NEXT: le lr, .LBB1_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne.w .LBB1_4
; CHECK-NEXT: b .LBB1_10
entry:
  ; Nothing to do for an empty range.
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector path.
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

for.body.preheader22:                             ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  ; %i.09.ph is the first index the scalar code still has to process.
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %0 = xor i32 %i.09.ph, -1
  ; %1 = ~%i.09.ph + N = N - 1 - %i.09.ph (remaining elements minus one).
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol:                                    ; preds = %for.body.preheader22, %for.body.prol
  ; Unroll prologue: one element per iteration, %xtraiter (N & 3) iterations.
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
  %2 = load float, float* %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, float* %b, i32 %i.09.prol
  %3 = load float, float* %arrayidx1.prol, align 4
  %add.prol = fadd float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
  store float %add.prol, float* %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  ; Skip the unrolled loop when fewer than 4 elements were left on entry
  ; to %for.body.preheader22.
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck:                                  ; preds = %for.body.preheader
  ; One-past-the-end pointers of the three arrays.
  %scevgep = getelementptr float, float* %c, i32 %N
  %scevgep13 = getelementptr float, float* %a, i32 %N
  %scevgep16 = getelementptr float, float* %b, i32 %N
  ; c overlaps a when (a + N > c) and (c + N > a).
  %bound0 = icmp ugt float* %scevgep13, %c
  %bound1 = icmp ugt float* %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  ; c overlaps b when (b + N > c) and (c + N > b).
  %bound018 = icmp ugt float* %scevgep16, %c
  %bound119 = icmp ugt float* %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; N rounded down to a multiple of 4.
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, float* %a, i32 %index
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %6, align 4
  %7 = getelementptr inbounds float, float* %b, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load21 = load <4 x float>, <4 x float>* %8, align 4
  %9 = fadd <4 x float> %wide.load, %wide.load21
  %10 = getelementptr inbounds float, float* %c, i32 %index
  %11 = bitcast float* %10 to <4 x float>*
  store <4 x float> %9, <4 x float>* %11, align 4
  %index.next = add i32 %index, 4
  %12 = icmp eq i32 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Done if N was a multiple of 4; otherwise finish the tail in scalar code.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup:                                 ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.prol.loopexit, %for.body
  ; Scalar loop unrolled by 4: elements %i.09 .. %i.09 + 3.
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
  %13 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.09
  %14 = load float, float* %arrayidx1, align 4
  %add = fadd float %13, %14
  %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
  %15 = load float, float* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, float* %b, i32 %inc
  %16 = load float, float* %arrayidx1.1, align 4
  %add.1 = fadd float %15, %16
  %arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
  store float %add.1, float* %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
  %17 = load float, float* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, float* %b, i32 %inc.1
  %18 = load float, float* %arrayidx1.2, align 4
  %add.2 = fadd float %17, %18
  %arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
  store float %add.2, float* %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
  %19 = load float, float* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, float* %b, i32 %inc.2
  %20 = load float, float* %arrayidx1.3, align 4
  %add.3 = fadd float %19, %20
  %arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
  store float %add.3, float* %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}

; float_float_sub: element-wise c[i] = a[i] - b[i] for i in [0, N).
;
; The IR is given in its already vectorized and unrolled form:
;   * vector.memcheck  - runtime overlap test of c against a and against b;
;                        any overlap falls back to the scalar path.
;   * vector.body      - <4 x float> per iteration over the first (N & -4)
;                        elements.
;   * for.body.prol    - (N & 3) single-element iterations (unroll prologue).
;   * for.body         - scalar loop unrolled by 4.
; The autogenerated assertions pin the Thumb2/MVE code llc emits for it,
; including the low-overhead loop instructions (wls/dls/le).
define arm_aapcs_vfpcc void @float_float_sub(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_sub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB2_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB2_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB2_4
; CHECK-NEXT: .LBB2_3: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r2, r3, lsl #2
; CHECK-NEXT: add.w r6, r1, r3, lsl #2
; CHECK-NEXT: cmp r5, r1
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: ands r5, r4
; CHECK-NEXT: lsls r5, r5, #31
; CHECK-NEXT: itt eq
; CHECK-NEXT: andeq r7, r6
; CHECK-NEXT: lslseq.w r7, r7, #31
; CHECK-NEXT: beq .LBB2_11
; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r5, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB2_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: mvn r4, #3
; CHECK-NEXT: add.w r7, r4, r12, lsl #2
; CHECK-NEXT: adds r4, r0, r7
; CHECK-NEXT: adds r6, r1, r7
; CHECK-NEXT: add r7, r2
; CHECK-NEXT: .LBB2_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s0, [r6, #4]
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vldr s2, [r4, #4]
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r7, #4]
; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: le lr, .LBB2_6
; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r5, #3
; CHECK-NEXT: blo .LBB2_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r3, r12
; CHECK-NEXT: sub.w r8, r0, #8
; CHECK-NEXT: sub.w r10, r1, #8
; CHECK-NEXT: sub.w r5, r2, #8
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB2_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r4, r0, r12
; CHECK-NEXT: add.w r6, r1, r12
; CHECK-NEXT: add.w r9, r2, r12
; CHECK-NEXT: add.w r7, r8, r12
; CHECK-NEXT: vldr s0, [r6, #4]
; CHECK-NEXT: add.w r3, r10, r12
; CHECK-NEXT: vldr s2, [r4, #4]
; CHECK-NEXT: add.w r11, r5, r12
; CHECK-NEXT: add.w r8, r8, #16
; CHECK-NEXT: add.w r10, r10, #16
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs.w lr, lr, #4
; CHECK-NEXT: vstr s0, [r9, #4]
; CHECK-NEXT: vldr s0, [r3, #12]
; CHECK-NEXT: vldr s2, [r7, #12]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r11, #12]
; CHECK-NEXT: vldr s0, [r3, #16]
; CHECK-NEXT: vldr s2, [r7, #16]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r11, #16]
; CHECK-NEXT: vldr s0, [r6, #16]
; CHECK-NEXT: vldr s2, [r4, #16]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r9, #16]
; CHECK-NEXT: bne .LBB2_9
; CHECK-NEXT: .LBB2_10: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB2_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: sub.w r4, r0, #16
; CHECK-NEXT: sub.w r5, r1, #16
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: sub.w r6, r2, #16
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5, #16]!
; CHECK-NEXT: vldrw.u32 q1, [r4, #16]!
; CHECK-NEXT: vsub.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6, #16]!
; CHECK-NEXT: le lr, .LBB2_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne.w .LBB2_4
; CHECK-NEXT: b .LBB2_10
entry:
  ; Nothing to do for an empty range.
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector path.
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

for.body.preheader22:                             ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  ; %i.09.ph is the first index the scalar code still has to process.
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %0 = xor i32 %i.09.ph, -1
  ; %1 = ~%i.09.ph + N = N - 1 - %i.09.ph (remaining elements minus one).
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol:                                    ; preds = %for.body.preheader22, %for.body.prol
  ; Unroll prologue: one element per iteration, %xtraiter (N & 3) iterations.
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
  %2 = load float, float* %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, float* %b, i32 %i.09.prol
  %3 = load float, float* %arrayidx1.prol, align 4
  %sub.prol = fsub float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
  store float %sub.prol, float* %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  ; Skip the unrolled loop when fewer than 4 elements were left on entry
  ; to %for.body.preheader22.
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck:                                  ; preds = %for.body.preheader
  ; One-past-the-end pointers of the three arrays.
  %scevgep = getelementptr float, float* %c, i32 %N
  %scevgep13 = getelementptr float, float* %a, i32 %N
  %scevgep16 = getelementptr float, float* %b, i32 %N
  ; c overlaps a when (a + N > c) and (c + N > a).
  %bound0 = icmp ugt float* %scevgep13, %c
  %bound1 = icmp ugt float* %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  ; c overlaps b when (b + N > c) and (c + N > b).
  %bound018 = icmp ugt float* %scevgep16, %c
  %bound119 = icmp ugt float* %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; N rounded down to a multiple of 4.
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, float* %a, i32 %index
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %6, align 4
  %7 = getelementptr inbounds float, float* %b, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load21 = load <4 x float>, <4 x float>* %8, align 4
  %9 = fsub <4 x float> %wide.load, %wide.load21
  %10 = getelementptr inbounds float, float* %c, i32 %index
  %11 = bitcast float* %10 to <4 x float>*
  store <4 x float> %9, <4 x float>* %11, align 4
  %index.next = add i32 %index, 4
  %12 = icmp eq i32 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Done if N was a multiple of 4; otherwise finish the tail in scalar code.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup:                                 ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.prol.loopexit, %for.body
  ; Scalar loop unrolled by 4: elements %i.09 .. %i.09 + 3.
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
  %13 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.09
  %14 = load float, float* %arrayidx1, align 4
  %sub = fsub float %13, %14
  %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
  store float %sub, float* %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
  %15 = load float, float* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, float* %b, i32 %inc
  %16 = load float, float* %arrayidx1.1, align 4
  %sub.1 = fsub float %15, %16
  %arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
  store float %sub.1, float* %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
  %17 = load float, float* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, float* %b, i32 %inc.1
  %18 = load float, float* %arrayidx1.2, align 4
  %sub.2 = fsub float %17, %18
  %arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
  store float %sub.2, float* %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
  %19 = load float, float* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, float* %b, i32 %inc.2
  %20 = load float, float* %arrayidx1.3, align 4
  %sub.3 = fsub float %19, %20
  %arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
  store float %sub.3, float* %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}

703 define arm_aapcs_vfpcc void @float_int_mul(float* nocapture readonly %a, i32* nocapture readonly %b, float* nocapture %c, i32 %N) {
704 ; CHECK-LABEL: float_int_mul:
705 ; CHECK: @ %bb.0: @ %entry
706 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
707 ; CHECK-NEXT: cmp r3, #0
708 ; CHECK-NEXT: beq.w .LBB3_13
709 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
710 ; CHECK-NEXT: cmp r3, #3
711 ; CHECK-NEXT: bls .LBB3_6
712 ; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
713 ; CHECK-NEXT: add.w r7, r0, r3, lsl #2
714 ; CHECK-NEXT: cmp r7, r2
716 ; CHECK-NEXT: addhi.w r7, r2, r3, lsl #2
717 ; CHECK-NEXT: cmphi r7, r0
718 ; CHECK-NEXT: bhi .LBB3_6
719 ; CHECK-NEXT: @ %bb.3: @ %vector.ph
720 ; CHECK-NEXT: bic r12, r3, #3
721 ; CHECK-NEXT: movs r6, #1
722 ; CHECK-NEXT: sub.w r7, r12, #4
723 ; CHECK-NEXT: sub.w r4, r0, #16
724 ; CHECK-NEXT: sub.w r5, r1, #16
725 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
726 ; CHECK-NEXT: sub.w r6, r2, #16
727 ; CHECK-NEXT: dls lr, lr
728 ; CHECK-NEXT: .LBB3_4: @ %vector.body
729 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
730 ; CHECK-NEXT: vldrw.u32 q0, [r5, #16]!
731 ; CHECK-NEXT: vldrw.u32 q1, [r4, #16]!
732 ; CHECK-NEXT: vcvt.f32.s32 q0, q0
733 ; CHECK-NEXT: vmul.f32 q0, q1, q0
734 ; CHECK-NEXT: vstrb.8 q0, [r6, #16]!
735 ; CHECK-NEXT: le lr, .LBB3_4
736 ; CHECK-NEXT: @ %bb.5: @ %middle.block
737 ; CHECK-NEXT: cmp r12, r3
738 ; CHECK-NEXT: bne .LBB3_7
739 ; CHECK-NEXT: b .LBB3_13
740 ; CHECK-NEXT: .LBB3_6:
741 ; CHECK-NEXT: mov.w r12, #0
742 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16
743 ; CHECK-NEXT: mvn.w r6, r12
744 ; CHECK-NEXT: adds r5, r6, r3
745 ; CHECK-NEXT: and lr, r3, #3
746 ; CHECK-NEXT: wls lr, lr, .LBB3_10
747 ; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader
748 ; CHECK-NEXT: mvn r4, #3
749 ; CHECK-NEXT: add.w r6, r4, r12, lsl #2
750 ; CHECK-NEXT: adds r4, r0, r6
751 ; CHECK-NEXT: add r6, r2
752 ; CHECK-NEXT: .LBB3_9: @ %for.body.prol
753 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
754 ; CHECK-NEXT: add.w r7, r1, r12, lsl #2
755 ; CHECK-NEXT: add.w r12, r12, #1
756 ; CHECK-NEXT: vldr s0, [r7]
757 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
758 ; CHECK-NEXT: vldr s2, [r4, #4]
759 ; CHECK-NEXT: adds r4, #4
760 ; CHECK-NEXT: vmul.f32 s0, s2, s0
761 ; CHECK-NEXT: vstr s0, [r6, #4]
762 ; CHECK-NEXT: adds r6, #4
763 ; CHECK-NEXT: le lr, .LBB3_9
764 ; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit
765 ; CHECK-NEXT: cmp r5, #3
766 ; CHECK-NEXT: blo .LBB3_13
767 ; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1
768 ; CHECK-NEXT: sub.w lr, r3, r12
769 ; CHECK-NEXT: sub.w r10, r0, #8
770 ; CHECK-NEXT: sub.w r4, r1, #8
771 ; CHECK-NEXT: sub.w r5, r2, #8
772 ; CHECK-NEXT: subs r0, #4
773 ; CHECK-NEXT: subs r1, #4
774 ; CHECK-NEXT: subs r2, #4
775 ; CHECK-NEXT: lsl.w r12, r12, #2
776 ; CHECK-NEXT: .LBB3_12: @ %for.body
777 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
778 ; CHECK-NEXT: add.w r3, r1, r12
779 ; CHECK-NEXT: add.w r11, r0, r12
780 ; CHECK-NEXT: add.w r9, r2, r12
781 ; CHECK-NEXT: add.w r6, r4, r12
782 ; CHECK-NEXT: vldr s0, [r3, #4]
783 ; CHECK-NEXT: add.w r7, r10, r12
784 ; CHECK-NEXT: add.w r8, r5, r12
785 ; CHECK-NEXT: add.w r10, r10, #16
786 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
787 ; CHECK-NEXT: vldr s2, [r11, #4]
788 ; CHECK-NEXT: adds r4, #16
789 ; CHECK-NEXT: adds r5, #16
790 ; CHECK-NEXT: adds r0, #16
791 ; CHECK-NEXT: adds r1, #16
792 ; CHECK-NEXT: adds r2, #16
793 ; CHECK-NEXT: subs.w lr, lr, #4
794 ; CHECK-NEXT: vmul.f32 s0, s2, s0
795 ; CHECK-NEXT: vstr s0, [r9, #4]
796 ; CHECK-NEXT: vldr s0, [r6, #12]
797 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
798 ; CHECK-NEXT: vldr s2, [r7, #12]
799 ; CHECK-NEXT: vmul.f32 s0, s2, s0
800 ; CHECK-NEXT: vstr s0, [r8, #12]
801 ; CHECK-NEXT: vldr s2, [r6, #16]
802 ; CHECK-NEXT: vldr s0, [r7, #16]
803 ; CHECK-NEXT: vcvt.f32.s32 s2, s2
804 ; CHECK-NEXT: vmul.f32 s0, s0, s2
805 ; CHECK-NEXT: vstr s0, [r8, #16]
806 ; CHECK-NEXT: vldr s0, [r3, #16]
807 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
808 ; CHECK-NEXT: vldr s2, [r11, #16]
809 ; CHECK-NEXT: vmul.f32 s0, s2, s0
810 ; CHECK-NEXT: vstr s0, [r9, #16]
811 ; CHECK-NEXT: bne .LBB3_12
812 ; CHECK-NEXT: .LBB3_13: @ %for.cond.cleanup
813 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
815 %cmp8 = icmp eq i32 %N, 0
816 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
818 for.body.preheader: ; preds = %entry
819 %min.iters.check = icmp ult i32 %N, 4
820 br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck
822 for.body.preheader16: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
823 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
824 %0 = xor i32 %i.09.ph, -1
826 %xtraiter = and i32 %N, 3
827 %lcmp.mod = icmp eq i32 %xtraiter, 0
828 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
830 for.body.prol: ; preds = %for.body.preheader16, %for.body.prol
831 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader16 ]
832 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader16 ]
833 %arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
834 %2 = load float, float* %arrayidx.prol, align 4
835 %arrayidx1.prol = getelementptr inbounds i32, i32* %b, i32 %i.09.prol
836 %3 = load i32, i32* %arrayidx1.prol, align 4
837 %conv.prol = sitofp i32 %3 to float
838 %mul.prol = fmul float %2, %conv.prol
839 %arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
840 store float %mul.prol, float* %arrayidx2.prol, align 4
841 %inc.prol = add nuw i32 %i.09.prol, 1
842 %prol.iter.sub = add i32 %prol.iter, -1
843 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
844 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
846 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader16
847 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
848 %4 = icmp ult i32 %1, 3
849 br i1 %4, label %for.cond.cleanup, label %for.body
851 vector.memcheck: ; preds = %for.body.preheader
852 %scevgep = getelementptr float, float* %c, i32 %N
853 %scevgep13 = getelementptr float, float* %a, i32 %N
854 %bound0 = icmp ugt float* %scevgep13, %c
855 %bound1 = icmp ugt float* %scevgep, %a
856 %found.conflict = and i1 %bound0, %bound1
857 br i1 %found.conflict, label %for.body.preheader16, label %vector.ph
859 vector.ph: ; preds = %vector.memcheck
860 %n.vec = and i32 %N, -4
861 br label %vector.body
863 vector.body: ; preds = %vector.body, %vector.ph
864 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
865 %5 = getelementptr inbounds float, float* %a, i32 %index
866 %6 = bitcast float* %5 to <4 x float>*
867 %wide.load = load <4 x float>, <4 x float>* %6, align 4
868 %7 = getelementptr inbounds i32, i32* %b, i32 %index
869 %8 = bitcast i32* %7 to <4 x i32>*
870 %wide.load15 = load <4 x i32>, <4 x i32>* %8, align 4
871 %9 = sitofp <4 x i32> %wide.load15 to <4 x float>
872 %10 = fmul <4 x float> %wide.load, %9
873 %11 = getelementptr inbounds float, float* %c, i32 %index
874 %12 = bitcast float* %11 to <4 x float>*
875 store <4 x float> %10, <4 x float>* %12, align 4
876 %index.next = add i32 %index, 4
877 %13 = icmp eq i32 %index.next, %n.vec
878 br i1 %13, label %middle.block, label %vector.body
880 middle.block: ; preds = %vector.body
881 %cmp.n = icmp eq i32 %n.vec, %N
882 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16
884 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
887 for.body: ; preds = %for.body.prol.loopexit, %for.body
888 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
889 %arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
890 %14 = load float, float* %arrayidx, align 4
891 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09
892 %15 = load i32, i32* %arrayidx1, align 4
893 %conv = sitofp i32 %15 to float
894 %mul = fmul float %14, %conv
895 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
896 store float %mul, float* %arrayidx2, align 4
897 %inc = add nuw i32 %i.09, 1
898 %arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
899 %16 = load float, float* %arrayidx.1, align 4
900 %arrayidx1.1 = getelementptr inbounds i32, i32* %b, i32 %inc
901 %17 = load i32, i32* %arrayidx1.1, align 4
902 %conv.1 = sitofp i32 %17 to float
903 %mul.1 = fmul float %16, %conv.1
904 %arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
905 store float %mul.1, float* %arrayidx2.1, align 4
906 %inc.1 = add nuw i32 %i.09, 2
907 %arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
908 %18 = load float, float* %arrayidx.2, align 4
909 %arrayidx1.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
910 %19 = load i32, i32* %arrayidx1.2, align 4
911 %conv.2 = sitofp i32 %19 to float
912 %mul.2 = fmul float %18, %conv.2
913 %arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
914 store float %mul.2, float* %arrayidx2.2, align 4
915 %inc.2 = add nuw i32 %i.09, 3
916 %arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
917 %20 = load float, float* %arrayidx.3, align 4
918 %arrayidx1.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
919 %21 = load i32, i32* %arrayidx1.3, align 4
920 %conv.3 = sitofp i32 %21 to float
921 %mul.3 = fmul float %20, %conv.3
922 %arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
923 store float %mul.3, float* %arrayidx2.3, align 4
924 %inc.3 = add nuw i32 %i.09, 4
925 %exitcond.3 = icmp eq i32 %inc.3, %N
926 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
; float_int_int_mul: for i in [0, N), c[i] = sitofp(b[i] * a[i]), where a and b
; are i32 arrays and c is a float array. The multiply is an integer `mul nsw`
; and the product is converted to float afterwards.
;
; The IR is already vectorised by 4: a <4 x i32> vector body handles the first
; (N & -4) elements and a scalar loop handles whatever is left. There is no
; runtime alias check in this function; %for.body.preheader branches straight
; to %vector.ph when N >= 4.
;
; The assertions below expect both loops to become low-overhead loops
; (dls/le); the vector body uses vldrw.u32, vmul.i32, vcvt.f32.s32 and a
; pre-indexed vstrb.8.
929 define arm_aapcs_vfpcc void @float_int_int_mul(i32* nocapture readonly %a, i32* nocapture readonly %b, float* nocapture %c, i32 %N) {
930 ; CHECK-LABEL: float_int_int_mul:
931 ; CHECK: @ %bb.0: @ %entry
932 ; CHECK-NEXT: push {r4, r5, r6, lr}
933 ; CHECK-NEXT: cmp r3, #0
934 ; CHECK-NEXT: beq .LBB4_8
935 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
936 ; CHECK-NEXT: cmp r3, #3
937 ; CHECK-NEXT: bhi .LBB4_3
938 ; CHECK-NEXT: @ %bb.2:
939 ; CHECK-NEXT: mov.w r12, #0
940 ; CHECK-NEXT: b .LBB4_6
941 ; CHECK-NEXT: .LBB4_3: @ %vector.ph
942 ; CHECK-NEXT: bic r12, r3, #3
943 ; CHECK-NEXT: movs r5, #1
944 ; CHECK-NEXT: sub.w r6, r12, #4
945 ; CHECK-NEXT: sub.w r4, r0, #16
946 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
947 ; CHECK-NEXT: sub.w r5, r1, #16
948 ; CHECK-NEXT: sub.w r6, r2, #16
949 ; CHECK-NEXT: dls lr, lr
950 ; CHECK-NEXT: .LBB4_4: @ %vector.body
951 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
952 ; CHECK-NEXT: vldrw.u32 q0, [r4, #16]!
953 ; CHECK-NEXT: vldrw.u32 q1, [r5, #16]!
954 ; CHECK-NEXT: vmul.i32 q0, q1, q0
955 ; CHECK-NEXT: vcvt.f32.s32 q0, q0
956 ; CHECK-NEXT: vstrb.8 q0, [r6, #16]!
957 ; CHECK-NEXT: le lr, .LBB4_4
958 ; CHECK-NEXT: @ %bb.5: @ %middle.block
959 ; CHECK-NEXT: cmp r12, r3
961 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
962 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11
963 ; CHECK-NEXT: sub.w lr, r3, r12
964 ; CHECK-NEXT: mvn r3, #3
965 ; CHECK-NEXT: add.w r3, r3, r12, lsl #2
966 ; CHECK-NEXT: dls lr, lr
967 ; CHECK-NEXT: add r0, r3
968 ; CHECK-NEXT: add r1, r3
969 ; CHECK-NEXT: add r2, r3
970 ; CHECK-NEXT: .LBB4_7: @ %for.body
971 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
972 ; CHECK-NEXT: ldr r3, [r0, #4]!
973 ; CHECK-NEXT: ldr r6, [r1, #4]!
974 ; CHECK-NEXT: muls r3, r6, r3
975 ; CHECK-NEXT: vmov s0, r3
976 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
977 ; CHECK-NEXT: vstr s0, [r2, #4]
978 ; CHECK-NEXT: adds r2, #4
979 ; CHECK-NEXT: le lr, .LBB4_7
980 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
981 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Entry: nothing to do when N == 0.
983 %cmp8 = icmp eq i32 %N, 0
984 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
986 for.body.preheader: ; preds = %entry
987 %min.iters.check = icmp ult i32 %N, 4
988 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
990 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
991 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
; %n.vec = N with the low two bits cleared, i.e. the vector loop's trip limit.
994 vector.ph: ; preds = %for.body.preheader
995 %n.vec = and i32 %N, -4
996 br label %vector.body
; Vector loop: load 4 x i32 from a and b, multiply, convert to 4 x float, store to c.
998 vector.body: ; preds = %vector.body, %vector.ph
999 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1000 %0 = getelementptr inbounds i32, i32* %a, i32 %index
1001 %1 = bitcast i32* %0 to <4 x i32>*
1002 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
1003 %2 = getelementptr inbounds i32, i32* %b, i32 %index
1004 %3 = bitcast i32* %2 to <4 x i32>*
1005 %wide.load10 = load <4 x i32>, <4 x i32>* %3, align 4
1006 %4 = mul nsw <4 x i32> %wide.load10, %wide.load
1007 %5 = sitofp <4 x i32> %4 to <4 x float>
1008 %6 = getelementptr inbounds float, float* %c, i32 %index
1009 %7 = bitcast float* %6 to <4 x float>*
1010 store <4 x float> %5, <4 x float>* %7, align 4
1011 %index.next = add i32 %index, 4
1012 %8 = icmp eq i32 %index.next, %n.vec
1013 br i1 %8, label %middle.block, label %vector.body
; Finished if the vector loop covered every element; otherwise run the scalar remainder.
1015 middle.block: ; preds = %vector.body
1016 %cmp.n = icmp eq i32 %n.vec, %N
1017 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1019 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, exits when the incremented index equals N.
1022 for.body: ; preds = %for.body.preheader11, %for.body
1023 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1024 %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.09
1025 %9 = load i32, i32* %arrayidx, align 4
1026 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09
1027 %10 = load i32, i32* %arrayidx1, align 4
1028 %mul = mul nsw i32 %10, %9
1029 %conv = sitofp i32 %mul to float
1030 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
1031 store float %conv, float* %arrayidx2, align 4
1032 %inc = add nuw i32 %i.09, 1
1033 %exitcond = icmp eq i32 %inc, %N
1034 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_mul: for i in [0, N), c[i] = fpext(a[i] * b[i]), where a and b are
; half arrays and c is a float array. The multiply is performed in half
; precision and the product is then extended to float.
;
; The IR is already vectorised by 4: a <4 x half> vector body handles the first
; (N & -4) elements and a scalar loop handles whatever is left. There is no
; runtime alias check in this function.
;
; The assertions below expect the vector body to fetch each 4 x half operand
; as two 32-bit ldr loads inserted with vmov.32, multiply with vmul.f16, then
; widen lane by lane (vmovx.f16 / vcvtb.f32.f16) before a pre-indexed vstrb.8.
; Both loops are expected to be low-overhead loops (dls/le).
1037 define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
1038 ; CHECK-LABEL: half_half_mul:
1039 ; CHECK: @ %bb.0: @ %entry
1040 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
1041 ; CHECK-NEXT: cmp r3, #0
1042 ; CHECK-NEXT: beq .LBB5_8
1043 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1044 ; CHECK-NEXT: cmp r3, #3
1045 ; CHECK-NEXT: bhi .LBB5_3
1046 ; CHECK-NEXT: @ %bb.2:
1047 ; CHECK-NEXT: mov.w r12, #0
1048 ; CHECK-NEXT: b .LBB5_6
1049 ; CHECK-NEXT: .LBB5_3: @ %vector.ph
1050 ; CHECK-NEXT: bic r12, r3, #3
1051 ; CHECK-NEXT: movs r5, #1
1052 ; CHECK-NEXT: sub.w r6, r12, #4
1053 ; CHECK-NEXT: sub.w r4, r0, #8
1054 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1055 ; CHECK-NEXT: sub.w r5, r1, #8
1056 ; CHECK-NEXT: sub.w r6, r2, #16
1057 ; CHECK-NEXT: dls lr, lr
1058 ; CHECK-NEXT: .LBB5_4: @ %vector.body
1059 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1060 ; CHECK-NEXT: ldr r8, [r5, #8]!
1061 ; CHECK-NEXT: ldr r7, [r4, #8]!
1062 ; CHECK-NEXT: vmov.32 q1[0], r8
1063 ; CHECK-NEXT: vmov.32 q0[0], r7
1064 ; CHECK-NEXT: ldr r7, [r5, #4]
1065 ; CHECK-NEXT: ldr.w r8, [r4, #4]
1066 ; CHECK-NEXT: vmov.32 q1[1], r7
1067 ; CHECK-NEXT: vmov.32 q0[1], r8
1068 ; CHECK-NEXT: vmul.f16 q0, q0, q1
1069 ; CHECK-NEXT: vmovx.f16 s6, s1
1070 ; CHECK-NEXT: vmovx.f16 s4, s0
1071 ; CHECK-NEXT: vcvtb.f32.f16 s11, s6
1072 ; CHECK-NEXT: vcvtb.f32.f16 s10, s1
1073 ; CHECK-NEXT: vcvtb.f32.f16 s9, s4
1074 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0
1075 ; CHECK-NEXT: vstrb.8 q2, [r6, #16]!
1076 ; CHECK-NEXT: le lr, .LBB5_4
1077 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1078 ; CHECK-NEXT: cmp r12, r3
1079 ; CHECK-NEXT: beq .LBB5_8
1080 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11
1081 ; CHECK-NEXT: sub.w lr, r3, r12
1082 ; CHECK-NEXT: sub.w r3, r12, #1
1083 ; CHECK-NEXT: dls lr, lr
1084 ; CHECK-NEXT: add.w r0, r0, r3, lsl #1
1085 ; CHECK-NEXT: add.w r1, r1, r3, lsl #1
1086 ; CHECK-NEXT: add.w r2, r2, r3, lsl #2
1087 ; CHECK-NEXT: .LBB5_7: @ %for.body
1088 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1089 ; CHECK-NEXT: vldr.16 s0, [r1, #2]
1090 ; CHECK-NEXT: vldr.16 s2, [r0, #2]
1091 ; CHECK-NEXT: adds r0, #2
1092 ; CHECK-NEXT: adds r1, #2
1093 ; CHECK-NEXT: vmul.f16 s0, s2, s0
1094 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1095 ; CHECK-NEXT: vstr s0, [r2, #4]
1096 ; CHECK-NEXT: adds r2, #4
1097 ; CHECK-NEXT: le lr, .LBB5_7
1098 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
1099 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
; Entry: nothing to do when N == 0.
1101 %cmp8 = icmp eq i32 %N, 0
1102 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1104 for.body.preheader: ; preds = %entry
1105 %min.iters.check = icmp ult i32 %N, 4
1106 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
1108 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
1109 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
; %n.vec = N with the low two bits cleared, i.e. the vector loop's trip limit.
1112 vector.ph: ; preds = %for.body.preheader
1113 %n.vec = and i32 %N, -4
1114 br label %vector.body
; Vector loop: load 4 x half from a and b (2-byte aligned), multiply in half,
; extend to 4 x float and store to c.
1116 vector.body: ; preds = %vector.body, %vector.ph
1117 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1118 %0 = getelementptr inbounds half, half* %a, i32 %index
1119 %1 = bitcast half* %0 to <4 x half>*
1120 %wide.load = load <4 x half>, <4 x half>* %1, align 2
1121 %2 = getelementptr inbounds half, half* %b, i32 %index
1122 %3 = bitcast half* %2 to <4 x half>*
1123 %wide.load10 = load <4 x half>, <4 x half>* %3, align 2
1124 %4 = fmul <4 x half> %wide.load, %wide.load10
1125 %5 = fpext <4 x half> %4 to <4 x float>
1126 %6 = getelementptr inbounds float, float* %c, i32 %index
1127 %7 = bitcast float* %6 to <4 x float>*
1128 store <4 x float> %5, <4 x float>* %7, align 4
1129 %index.next = add i32 %index, 4
1130 %8 = icmp eq i32 %index.next, %n.vec
1131 br i1 %8, label %middle.block, label %vector.body
; Finished if the vector loop covered every element; otherwise run the scalar remainder.
1133 middle.block: ; preds = %vector.body
1134 %cmp.n = icmp eq i32 %n.vec, %N
1135 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1137 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, exits when the incremented index equals N.
1140 for.body: ; preds = %for.body.preheader11, %for.body
1141 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1142 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.09
1143 %9 = load half, half* %arrayidx, align 2
1144 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.09
1145 %10 = load half, half* %arrayidx1, align 2
1146 %mul = fmul half %9, %10
1147 %conv = fpext half %mul to float
1148 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
1149 store float %conv, float* %arrayidx2, align 4
1150 %inc = add nuw i32 %i.09, 1
1151 %exitcond = icmp eq i32 %inc, %N
1152 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_add: for i in [0, N), c[i] = fpext(a[i] + b[i]), where a and b are
; half arrays and c is a float array. The addition is performed in half
; precision and the sum is then extended to float.
;
; The IR is already vectorised by 4: a <4 x half> vector body handles the first
; (N & -4) elements and a scalar loop handles whatever is left. There is no
; runtime alias check in this function.
;
; The assertions below expect the vector body to fetch each 4 x half operand
; as two 32-bit ldr loads inserted with vmov.32, add with vadd.f16, then widen
; lane by lane (vmovx.f16 / vcvtb.f32.f16) before a pre-indexed vstrb.8.
; Both loops are expected to be low-overhead loops (dls/le).
1155 define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
1156 ; CHECK-LABEL: half_half_add:
1157 ; CHECK: @ %bb.0: @ %entry
1158 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
1159 ; CHECK-NEXT: cmp r3, #0
1160 ; CHECK-NEXT: beq .LBB6_8
1161 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1162 ; CHECK-NEXT: cmp r3, #3
1163 ; CHECK-NEXT: bhi .LBB6_3
1164 ; CHECK-NEXT: @ %bb.2:
1165 ; CHECK-NEXT: mov.w r12, #0
1166 ; CHECK-NEXT: b .LBB6_6
1167 ; CHECK-NEXT: .LBB6_3: @ %vector.ph
1168 ; CHECK-NEXT: bic r12, r3, #3
1169 ; CHECK-NEXT: movs r5, #1
1170 ; CHECK-NEXT: sub.w r6, r12, #4
1171 ; CHECK-NEXT: sub.w r4, r0, #8
1172 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1173 ; CHECK-NEXT: sub.w r5, r1, #8
1174 ; CHECK-NEXT: sub.w r6, r2, #16
1175 ; CHECK-NEXT: dls lr, lr
1176 ; CHECK-NEXT: .LBB6_4: @ %vector.body
1177 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1178 ; CHECK-NEXT: ldr r8, [r5, #8]!
1179 ; CHECK-NEXT: ldr r7, [r4, #8]!
1180 ; CHECK-NEXT: vmov.32 q1[0], r8
1181 ; CHECK-NEXT: vmov.32 q0[0], r7
1182 ; CHECK-NEXT: ldr r7, [r5, #4]
1183 ; CHECK-NEXT: ldr.w r8, [r4, #4]
1184 ; CHECK-NEXT: vmov.32 q1[1], r7
1185 ; CHECK-NEXT: vmov.32 q0[1], r8
1186 ; CHECK-NEXT: vadd.f16 q0, q0, q1
1187 ; CHECK-NEXT: vmovx.f16 s6, s1
1188 ; CHECK-NEXT: vmovx.f16 s4, s0
1189 ; CHECK-NEXT: vcvtb.f32.f16 s11, s6
1190 ; CHECK-NEXT: vcvtb.f32.f16 s10, s1
1191 ; CHECK-NEXT: vcvtb.f32.f16 s9, s4
1192 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0
1193 ; CHECK-NEXT: vstrb.8 q2, [r6, #16]!
1194 ; CHECK-NEXT: le lr, .LBB6_4
1195 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1196 ; CHECK-NEXT: cmp r12, r3
1197 ; CHECK-NEXT: beq .LBB6_8
1198 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11
1199 ; CHECK-NEXT: sub.w lr, r3, r12
1200 ; CHECK-NEXT: sub.w r3, r12, #1
1201 ; CHECK-NEXT: dls lr, lr
1202 ; CHECK-NEXT: add.w r0, r0, r3, lsl #1
1203 ; CHECK-NEXT: add.w r1, r1, r3, lsl #1
1204 ; CHECK-NEXT: add.w r2, r2, r3, lsl #2
1205 ; CHECK-NEXT: .LBB6_7: @ %for.body
1206 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1207 ; CHECK-NEXT: vldr.16 s0, [r1, #2]
1208 ; CHECK-NEXT: vldr.16 s2, [r0, #2]
1209 ; CHECK-NEXT: adds r0, #2
1210 ; CHECK-NEXT: adds r1, #2
1211 ; CHECK-NEXT: vadd.f16 s0, s2, s0
1212 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1213 ; CHECK-NEXT: vstr s0, [r2, #4]
1214 ; CHECK-NEXT: adds r2, #4
1215 ; CHECK-NEXT: le lr, .LBB6_7
1216 ; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
1217 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
; Entry: nothing to do when N == 0.
1219 %cmp8 = icmp eq i32 %N, 0
1220 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1222 for.body.preheader: ; preds = %entry
1223 %min.iters.check = icmp ult i32 %N, 4
1224 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
1226 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
1227 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
; %n.vec = N with the low two bits cleared, i.e. the vector loop's trip limit.
1230 vector.ph: ; preds = %for.body.preheader
1231 %n.vec = and i32 %N, -4
1232 br label %vector.body
; Vector loop: load 4 x half from a and b (2-byte aligned), add in half,
; extend to 4 x float and store to c.
1234 vector.body: ; preds = %vector.body, %vector.ph
1235 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1236 %0 = getelementptr inbounds half, half* %a, i32 %index
1237 %1 = bitcast half* %0 to <4 x half>*
1238 %wide.load = load <4 x half>, <4 x half>* %1, align 2
1239 %2 = getelementptr inbounds half, half* %b, i32 %index
1240 %3 = bitcast half* %2 to <4 x half>*
1241 %wide.load10 = load <4 x half>, <4 x half>* %3, align 2
1242 %4 = fadd <4 x half> %wide.load, %wide.load10
1243 %5 = fpext <4 x half> %4 to <4 x float>
1244 %6 = getelementptr inbounds float, float* %c, i32 %index
1245 %7 = bitcast float* %6 to <4 x float>*
1246 store <4 x float> %5, <4 x float>* %7, align 4
1247 %index.next = add i32 %index, 4
1248 %8 = icmp eq i32 %index.next, %n.vec
1249 br i1 %8, label %middle.block, label %vector.body
; Finished if the vector loop covered every element; otherwise run the scalar remainder.
1251 middle.block: ; preds = %vector.body
1252 %cmp.n = icmp eq i32 %n.vec, %N
1253 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1255 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, exits when the incremented index equals N.
1258 for.body: ; preds = %for.body.preheader11, %for.body
1259 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1260 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.09
1261 %9 = load half, half* %arrayidx, align 2
1262 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.09
1263 %10 = load half, half* %arrayidx1, align 2
1264 %add = fadd half %9, %10
1265 %conv = fpext half %add to float
1266 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
1267 store float %conv, float* %arrayidx2, align 4
1268 %inc = add nuw i32 %i.09, 1
1269 %exitcond = icmp eq i32 %inc, %N
1270 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_sub: for i in [0, N), c[i] = fpext(a[i] - b[i]), where a and b are
; half arrays and c is a float array. The subtraction is performed in half
; precision and the difference is then extended to float.
;
; The IR is already vectorised by 4: a <4 x half> vector body handles the first
; (N & -4) elements and a scalar loop handles whatever is left. There is no
; runtime alias check in this function.
;
; The assertions below expect the vector body to fetch each 4 x half operand
; as two 32-bit ldr loads inserted with vmov.32, subtract with vsub.f16, then
; widen lane by lane (vmovx.f16 / vcvtb.f32.f16) before a pre-indexed vstrb.8.
; Both loops are expected to be low-overhead loops (dls/le).
1273 define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
1274 ; CHECK-LABEL: half_half_sub:
1275 ; CHECK: @ %bb.0: @ %entry
1276 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
1277 ; CHECK-NEXT: cmp r3, #0
1278 ; CHECK-NEXT: beq .LBB7_8
1279 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1280 ; CHECK-NEXT: cmp r3, #3
1281 ; CHECK-NEXT: bhi .LBB7_3
1282 ; CHECK-NEXT: @ %bb.2:
1283 ; CHECK-NEXT: mov.w r12, #0
1284 ; CHECK-NEXT: b .LBB7_6
1285 ; CHECK-NEXT: .LBB7_3: @ %vector.ph
1286 ; CHECK-NEXT: bic r12, r3, #3
1287 ; CHECK-NEXT: movs r5, #1
1288 ; CHECK-NEXT: sub.w r6, r12, #4
1289 ; CHECK-NEXT: sub.w r4, r0, #8
1290 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1291 ; CHECK-NEXT: sub.w r5, r1, #8
1292 ; CHECK-NEXT: sub.w r6, r2, #16
1293 ; CHECK-NEXT: dls lr, lr
1294 ; CHECK-NEXT: .LBB7_4: @ %vector.body
1295 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1296 ; CHECK-NEXT: ldr r8, [r5, #8]!
1297 ; CHECK-NEXT: ldr r7, [r4, #8]!
1298 ; CHECK-NEXT: vmov.32 q1[0], r8
1299 ; CHECK-NEXT: vmov.32 q0[0], r7
1300 ; CHECK-NEXT: ldr r7, [r5, #4]
1301 ; CHECK-NEXT: ldr.w r8, [r4, #4]
1302 ; CHECK-NEXT: vmov.32 q1[1], r7
1303 ; CHECK-NEXT: vmov.32 q0[1], r8
1304 ; CHECK-NEXT: vsub.f16 q0, q0, q1
1305 ; CHECK-NEXT: vmovx.f16 s6, s1
1306 ; CHECK-NEXT: vmovx.f16 s4, s0
1307 ; CHECK-NEXT: vcvtb.f32.f16 s11, s6
1308 ; CHECK-NEXT: vcvtb.f32.f16 s10, s1
1309 ; CHECK-NEXT: vcvtb.f32.f16 s9, s4
1310 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0
1311 ; CHECK-NEXT: vstrb.8 q2, [r6, #16]!
1312 ; CHECK-NEXT: le lr, .LBB7_4
1313 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1314 ; CHECK-NEXT: cmp r12, r3
1315 ; CHECK-NEXT: beq .LBB7_8
1316 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11
1317 ; CHECK-NEXT: sub.w lr, r3, r12
1318 ; CHECK-NEXT: sub.w r3, r12, #1
1319 ; CHECK-NEXT: dls lr, lr
1320 ; CHECK-NEXT: add.w r0, r0, r3, lsl #1
1321 ; CHECK-NEXT: add.w r1, r1, r3, lsl #1
1322 ; CHECK-NEXT: add.w r2, r2, r3, lsl #2
1323 ; CHECK-NEXT: .LBB7_7: @ %for.body
1324 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1325 ; CHECK-NEXT: vldr.16 s0, [r1, #2]
1326 ; CHECK-NEXT: vldr.16 s2, [r0, #2]
1327 ; CHECK-NEXT: adds r0, #2
1328 ; CHECK-NEXT: adds r1, #2
1329 ; CHECK-NEXT: vsub.f16 s0, s2, s0
1330 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1331 ; CHECK-NEXT: vstr s0, [r2, #4]
1332 ; CHECK-NEXT: adds r2, #4
1333 ; CHECK-NEXT: le lr, .LBB7_7
1334 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
1335 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
; Entry: nothing to do when N == 0.
1337 %cmp8 = icmp eq i32 %N, 0
1338 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1340 for.body.preheader: ; preds = %entry
1341 %min.iters.check = icmp ult i32 %N, 4
1342 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
1344 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
1345 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
; %n.vec = N with the low two bits cleared, i.e. the vector loop's trip limit.
1348 vector.ph: ; preds = %for.body.preheader
1349 %n.vec = and i32 %N, -4
1350 br label %vector.body
; Vector loop: load 4 x half from a and b (2-byte aligned), subtract in half,
; extend to 4 x float and store to c.
1352 vector.body: ; preds = %vector.body, %vector.ph
1353 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1354 %0 = getelementptr inbounds half, half* %a, i32 %index
1355 %1 = bitcast half* %0 to <4 x half>*
1356 %wide.load = load <4 x half>, <4 x half>* %1, align 2
1357 %2 = getelementptr inbounds half, half* %b, i32 %index
1358 %3 = bitcast half* %2 to <4 x half>*
1359 %wide.load10 = load <4 x half>, <4 x half>* %3, align 2
1360 %4 = fsub <4 x half> %wide.load, %wide.load10
1361 %5 = fpext <4 x half> %4 to <4 x float>
1362 %6 = getelementptr inbounds float, float* %c, i32 %index
1363 %7 = bitcast float* %6 to <4 x float>*
1364 store <4 x float> %5, <4 x float>* %7, align 4
1365 %index.next = add i32 %index, 4
1366 %8 = icmp eq i32 %index.next, %n.vec
1367 br i1 %8, label %middle.block, label %vector.body
; Finished if the vector loop covered every element; otherwise run the scalar remainder.
1369 middle.block: ; preds = %vector.body
1370 %cmp.n = icmp eq i32 %n.vec, %N
1371 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1373 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, exits when the incremented index equals N.
1376 for.body: ; preds = %for.body.preheader11, %for.body
1377 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1378 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.09
1379 %9 = load half, half* %arrayidx, align 2
1380 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.09
1381 %10 = load half, half* %arrayidx1, align 2
1382 %sub = fsub half %9, %10
1383 %conv = fpext half %sub to float
1384 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
1385 store float %conv, float* %arrayidx2, align 4
1386 %inc = add nuw i32 %i.09, 1
1387 %exitcond = icmp eq i32 %inc, %N
1388 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_short_mul: for i in [0, N), c[i] = fpext(a[i] * sitofp(b[i])), where a
; is a half array, b is an i16 array and c is a float array. Each b element is
; converted from signed i16 to half, multiplied with a[i] in half precision,
; and the product is extended to float.
;
; The IR is already vectorised by 4: a vector body (<4 x half> and <4 x i16>
; loads) handles the first (N & -4) elements and a scalar loop handles whatever
; is left. There is no runtime alias check in this function.
;
; The assertions below expect the vector body to load b with vldrh.u32, move
; its four lanes one by one into 16-bit lanes (vmov / vmov.16) ahead of
; vcvt.f16.s16, build the a operand from two 32-bit ldr loads, multiply with
; vmul.f16 and widen lane by lane before a pre-indexed vstrb.8. The scalar
; loop is expected to use ldrsh followed by vcvt.f16.s32. Both loops are
; expected to be low-overhead loops (dls/le).
1391 define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* nocapture readonly %b, float* nocapture %c, i32 %N) {
1392 ; CHECK-LABEL: half_short_mul:
1393 ; CHECK: @ %bb.0: @ %entry
1394 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
1395 ; CHECK-NEXT: cmp r3, #0
1396 ; CHECK-NEXT: beq .LBB8_8
1397 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1398 ; CHECK-NEXT: cmp r3, #3
1399 ; CHECK-NEXT: bhi .LBB8_3
1400 ; CHECK-NEXT: @ %bb.2:
1401 ; CHECK-NEXT: mov.w r12, #0
1402 ; CHECK-NEXT: b .LBB8_6
1403 ; CHECK-NEXT: .LBB8_3: @ %vector.ph
1404 ; CHECK-NEXT: bic r12, r3, #3
1405 ; CHECK-NEXT: movs r5, #1
1406 ; CHECK-NEXT: sub.w r6, r12, #4
1407 ; CHECK-NEXT: sub.w r4, r0, #8
1408 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1409 ; CHECK-NEXT: sub.w r5, r1, #8
1410 ; CHECK-NEXT: sub.w r6, r2, #16
1411 ; CHECK-NEXT: dls lr, lr
1412 ; CHECK-NEXT: .LBB8_4: @ %vector.body
1413 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1414 ; CHECK-NEXT: ldr r8, [r4, #8]!
1415 ; CHECK-NEXT: vldrh.u32 q0, [r5, #8]!
1416 ; CHECK-NEXT: vmov r7, s0
1417 ; CHECK-NEXT: vmov.16 q1[0], r7
1418 ; CHECK-NEXT: vmov r7, s1
1419 ; CHECK-NEXT: vmov.16 q1[1], r7
1420 ; CHECK-NEXT: vmov r7, s2
1421 ; CHECK-NEXT: vmov.16 q1[2], r7
1422 ; CHECK-NEXT: vmov r7, s3
1423 ; CHECK-NEXT: vmov.16 q1[3], r7
1424 ; CHECK-NEXT: ldr r7, [r4, #4]
1425 ; CHECK-NEXT: vcvt.f16.s16 q0, q1
1426 ; CHECK-NEXT: vmov.32 q1[0], r8
1427 ; CHECK-NEXT: vmov.32 q1[1], r7
1428 ; CHECK-NEXT: vmul.f16 q0, q1, q0
1429 ; CHECK-NEXT: vmovx.f16 s6, s1
1430 ; CHECK-NEXT: vmovx.f16 s4, s0
1431 ; CHECK-NEXT: vcvtb.f32.f16 s11, s6
1432 ; CHECK-NEXT: vcvtb.f32.f16 s10, s1
1433 ; CHECK-NEXT: vcvtb.f32.f16 s9, s4
1434 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0
1435 ; CHECK-NEXT: vstrb.8 q2, [r6, #16]!
1436 ; CHECK-NEXT: le lr, .LBB8_4
1437 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1438 ; CHECK-NEXT: cmp r12, r3
1439 ; CHECK-NEXT: beq .LBB8_8
1440 ; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13
1441 ; CHECK-NEXT: sub.w lr, r3, r12
1442 ; CHECK-NEXT: sub.w r3, r12, #1
1443 ; CHECK-NEXT: dls lr, lr
1444 ; CHECK-NEXT: add.w r0, r0, r3, lsl #1
1445 ; CHECK-NEXT: add.w r1, r1, r3, lsl #1
1446 ; CHECK-NEXT: add.w r2, r2, r3, lsl #2
1447 ; CHECK-NEXT: .LBB8_7: @ %for.body
1448 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1449 ; CHECK-NEXT: ldrsh r3, [r1, #2]!
1450 ; CHECK-NEXT: vldr.16 s0, [r0, #2]
1451 ; CHECK-NEXT: adds r0, #2
1452 ; CHECK-NEXT: vmov s2, r3
1453 ; CHECK-NEXT: vcvt.f16.s32 s2, s2
1454 ; CHECK-NEXT: vmul.f16 s0, s0, s2
1455 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1456 ; CHECK-NEXT: vstr s0, [r2, #4]
1457 ; CHECK-NEXT: adds r2, #4
1458 ; CHECK-NEXT: le lr, .LBB8_7
1459 ; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup
1460 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
; Entry: nothing to do when N == 0.
1462 %cmp10 = icmp eq i32 %N, 0
1463 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1465 for.body.preheader: ; preds = %entry
1466 %min.iters.check = icmp ult i32 %N, 4
1467 br i1 %min.iters.check, label %for.body.preheader13, label %vector.ph
; Scalar loop start index: 0 if the vector loop was skipped, otherwise %n.vec.
1469 for.body.preheader13: ; preds = %middle.block, %for.body.preheader
1470 %i.011.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
; %n.vec = N with the low two bits cleared, i.e. the vector loop's trip limit.
1473 vector.ph: ; preds = %for.body.preheader
1474 %n.vec = and i32 %N, -4
1475 br label %vector.body
; Vector loop: load 4 x half from a and 4 x i16 from b, convert b to half,
; multiply in half, extend to 4 x float and store to c.
1477 vector.body: ; preds = %vector.body, %vector.ph
1478 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1479 %0 = getelementptr inbounds half, half* %a, i32 %index
1480 %1 = bitcast half* %0 to <4 x half>*
1481 %wide.load = load <4 x half>, <4 x half>* %1, align 2
1482 %2 = getelementptr inbounds i16, i16* %b, i32 %index
1483 %3 = bitcast i16* %2 to <4 x i16>*
1484 %wide.load12 = load <4 x i16>, <4 x i16>* %3, align 2
1485 %4 = sitofp <4 x i16> %wide.load12 to <4 x half>
1486 %5 = fmul <4 x half> %wide.load, %4
1487 %6 = fpext <4 x half> %5 to <4 x float>
1488 %7 = getelementptr inbounds float, float* %c, i32 %index
1489 %8 = bitcast float* %7 to <4 x float>*
1490 store <4 x float> %6, <4 x float>* %8, align 4
1491 %index.next = add i32 %index, 4
1492 %9 = icmp eq i32 %index.next, %n.vec
1493 br i1 %9, label %middle.block, label %vector.body
; Finished if the vector loop covered every element; otherwise run the scalar remainder.
1495 middle.block: ; preds = %vector.body
1496 %cmp.n = icmp eq i32 %n.vec, %N
1497 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
1499 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, exits when the incremented index equals N.
1502 for.body: ; preds = %for.body.preheader13, %for.body
1503 %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader13 ]
1504 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.011
1505 %10 = load half, half* %arrayidx, align 2
1506 %arrayidx1 = getelementptr inbounds i16, i16* %b, i32 %i.011
1507 %11 = load i16, i16* %arrayidx1, align 2
1508 %conv2 = sitofp i16 %11 to half
1509 %mul = fmul half %10, %conv2
1510 %conv3 = fpext half %mul to float
1511 %arrayidx4 = getelementptr inbounds float, float* %c, i32 %i.011
1512 store float %conv3, float* %arrayidx4, align 4
1513 %inc = add nuw i32 %i.011, 1
1514 %exitcond = icmp eq i32 %inc, %N
1515 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_mac: multiply-accumulate over two half arrays. Each element pair
; a[i], b[i] is multiplied in half precision, the product is extended to float
; and added to a float accumulator, which is the return value.
; The IR is already unrolled: %for.body processes four elements per iteration
; and %for.body.epil processes the remaining (N & 3) elements one at a time.
; The autogenerated assertions below expect a dls/le low-overhead loop for the
; unrolled body and a wls/le loop for the epilogue, using scalar vmul.f16,
; vcvtb.f32.f16 and vadd.f32 for every element.
1518 define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) {
1519 ; CHECK-LABEL: half_half_mac:
1520 ; CHECK: @ %bb.0: @ %entry
1521 ; CHECK-NEXT: push {r4, r5, r7, lr}
1522 ; CHECK-NEXT: cbz r2, .LBB9_3
1523 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1524 ; CHECK-NEXT: subs r3, r2, #1
1525 ; CHECK-NEXT: and r4, r2, #3
1526 ; CHECK-NEXT: cmp r3, #3
1527 ; CHECK-NEXT: bhs .LBB9_4
1528 ; CHECK-NEXT: @ %bb.2:
1529 ; CHECK-NEXT: vldr s0, .LCPI9_0
1530 ; CHECK-NEXT: mov r5, r4
1531 ; CHECK-NEXT: mov.w r12, #0
1532 ; CHECK-NEXT: b .LBB9_6
1533 ; CHECK-NEXT: .LBB9_3:
1534 ; CHECK-NEXT: vldr s0, .LCPI9_0
1535 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1536 ; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
1537 ; CHECK-NEXT: subs r2, r2, r4
1538 ; CHECK-NEXT: movs r3, #1
1539 ; CHECK-NEXT: subs r2, #4
1540 ; CHECK-NEXT: vldr s0, .LCPI9_0
1541 ; CHECK-NEXT: mov r5, r4
1542 ; CHECK-NEXT: mov.w r12, #0
1543 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1544 ; CHECK-NEXT: sub.w r3, r0, #8
1545 ; CHECK-NEXT: sub.w r2, r1, #8
1546 ; CHECK-NEXT: dls lr, lr
1547 ; CHECK-NEXT: .LBB9_5: @ %for.body
1548 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1549 ; CHECK-NEXT: vldr.16 s2, [r2, #14]
1550 ; CHECK-NEXT: vldr.16 s4, [r3, #14]
1551 ; CHECK-NEXT: vldr.16 s6, [r3, #12]
1552 ; CHECK-NEXT: vldr.16 s8, [r3, #10]
1553 ; CHECK-NEXT: vmul.f16 s2, s4, s2
1554 ; CHECK-NEXT: vldr.16 s4, [r2, #12]
1555 ; CHECK-NEXT: vldr.16 s10, [r3, #8]
1556 ; CHECK-NEXT: adds r3, #8
1557 ; CHECK-NEXT: vmul.f16 s4, s6, s4
1558 ; CHECK-NEXT: vldr.16 s6, [r2, #10]
1559 ; CHECK-NEXT: add.w r12, r12, #4
1560 ; CHECK-NEXT: vmul.f16 s6, s8, s6
1561 ; CHECK-NEXT: vldr.16 s8, [r2, #8]
1562 ; CHECK-NEXT: adds r2, #8
1563 ; CHECK-NEXT: vmul.f16 s8, s10, s8
1564 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1565 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1566 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1567 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1568 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1569 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1570 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1571 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1572 ; CHECK-NEXT: le lr, .LBB9_5
1573 ; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1574 ; CHECK-NEXT: wls lr, r5, .LBB9_9
1575 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1576 ; CHECK-NEXT: mvn r2, #1
1577 ; CHECK-NEXT: mov lr, r5
1578 ; CHECK-NEXT: add.w r2, r2, r12, lsl #1
1579 ; CHECK-NEXT: add r0, r2
1580 ; CHECK-NEXT: add r1, r2
1581 ; CHECK-NEXT: .LBB9_8: @ %for.body.epil
1582 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1583 ; CHECK-NEXT: vldr.16 s2, [r1, #2]
1584 ; CHECK-NEXT: vldr.16 s4, [r0, #2]
1585 ; CHECK-NEXT: adds r0, #2
1586 ; CHECK-NEXT: adds r1, #2
1587 ; CHECK-NEXT: vmul.f16 s2, s4, s2
1588 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1589 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1590 ; CHECK-NEXT: le lr, .LBB9_8
1591 ; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
1592 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1593 ; CHECK-NEXT: .p2align 2
1594 ; CHECK-NEXT: @ %bb.10:
1595 ; CHECK-NEXT: .LCPI9_0:
1596 ; CHECK-NEXT: .long 0 @ float 0
; N == 0 branches straight to %for.cond.cleanup, whose phi yields 0.0 for the
; %entry edge, so no element is loaded.
1598 %cmp8 = icmp eq i32 %N, 0
1599 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1601 for.body.preheader: ; preds = %entry
; %xtraiter (N & 3) is the trip count of the epilogue loop.
; NOTE(review): %0 is defined on a line not visible in this excerpt; the
; expected output (`subs r3, r2, #1` / `cmp r3, #3`) suggests it is N - 1,
; i.e. the unrolled loop is skipped when N < 4 - confirm.
1603 %xtraiter = and i32 %N, 3
1604 %1 = icmp ult i32 %0, 3
1605 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1607 for.body.preheader.new: ; preds = %for.body.preheader
; Number of elements handled by the unrolled loop: N with the low two bits
; removed, so it is a multiple of 4.
1608 %unroll_iter = sub i32 %N, %xtraiter
; Join point after (or instead of) the unrolled loop. %i.010.unr and
; %res.09.unr carry the next index and the partial sum into the epilogue.
; %add.lcssa.ph is only consumed on the edge to %for.cond.cleanup, i.e. when
; %xtraiter == 0; its undef input comes from the path that skips the unrolled
; loop (presumably never combined with %xtraiter == 0 - depends on %0, verify).
1611 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1612 %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
1613 %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1614 %res.09.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
1615 %lcmp.mod = icmp eq i32 %xtraiter, 0
1616 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Epilogue loop: one element per iteration, counting %epil.iter down from
; %xtraiter to zero.
1618 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1619 %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1620 %res.09.epil = phi float [ %add.epil, %for.body.epil ], [ %res.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1621 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1622 %arrayidx.epil = getelementptr inbounds half, half* %a, i32 %i.010.epil
1623 %2 = load half, half* %arrayidx.epil, align 2
1624 %arrayidx1.epil = getelementptr inbounds half, half* %b, i32 %i.010.epil
1625 %3 = load half, half* %arrayidx1.epil, align 2
; Half-precision multiply, then widen to float before accumulating.
1626 %mul.epil = fmul half %2, %3
1627 %conv.epil = fpext half %mul.epil to float
1628 %add.epil = fadd float %res.09.epil, %conv.epil
1629 %inc.epil = add nuw i32 %i.010.epil, 1
1630 %epil.iter.sub = add i32 %epil.iter, -1
1631 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1632 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
; Single exit: selects 0.0 (N == 0), the unrolled-loop sum, or the epilogue sum.
1634 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1635 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
1636 ret float %res.0.lcssa
; Unrolled main loop: four multiply-accumulate steps per iteration. %i.010
; starts at 0 and advances by 4, so the `or` with 1, 2 and 3 below yields
; %i.010 + 1, + 2 and + 3. %niter counts down from %unroll_iter in steps of 4.
1638 for.body: ; preds = %for.body, %for.body.preheader.new
1639 %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1640 %res.09 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
1641 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1642 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.010
1643 %4 = load half, half* %arrayidx, align 2
1644 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.010
1645 %5 = load half, half* %arrayidx1, align 2
1646 %mul = fmul half %4, %5
1647 %conv = fpext half %mul to float
1648 %add = fadd float %res.09, %conv
1649 %inc = or i32 %i.010, 1
1650 %arrayidx.1 = getelementptr inbounds half, half* %a, i32 %inc
1651 %6 = load half, half* %arrayidx.1, align 2
1652 %arrayidx1.1 = getelementptr inbounds half, half* %b, i32 %inc
1653 %7 = load half, half* %arrayidx1.1, align 2
1654 %mul.1 = fmul half %6, %7
1655 %conv.1 = fpext half %mul.1 to float
1656 %add.1 = fadd float %add, %conv.1
1657 %inc.1 = or i32 %i.010, 2
1658 %arrayidx.2 = getelementptr inbounds half, half* %a, i32 %inc.1
1659 %8 = load half, half* %arrayidx.2, align 2
1660 %arrayidx1.2 = getelementptr inbounds half, half* %b, i32 %inc.1
1661 %9 = load half, half* %arrayidx1.2, align 2
1662 %mul.2 = fmul half %8, %9
1663 %conv.2 = fpext half %mul.2 to float
1664 %add.2 = fadd float %add.1, %conv.2
1665 %inc.2 = or i32 %i.010, 3
1666 %arrayidx.3 = getelementptr inbounds half, half* %a, i32 %inc.2
1667 %10 = load half, half* %arrayidx.3, align 2
1668 %arrayidx1.3 = getelementptr inbounds half, half* %b, i32 %inc.2
1669 %11 = load half, half* %arrayidx1.3, align 2
1670 %mul.3 = fmul half %10, %11
1671 %conv.3 = fpext half %mul.3 to float
1672 %add.3 = fadd float %add.2, %conv.3
1673 %inc.3 = add nuw i32 %i.010, 4
1674 %niter.nsub.3 = add i32 %niter, -4
1675 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1676 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; half_half_acc: add-accumulate over two half arrays. Each element pair
; a[i], b[i] is added in half precision, the sum is extended to float and
; added to a float accumulator, which is the return value.
; The IR is already unrolled: %for.body processes four elements per iteration
; and %for.body.epil processes the remaining (N & 3) elements one at a time.
; The autogenerated assertions below expect a dls/le low-overhead loop for the
; unrolled body and a wls/le loop for the epilogue, using scalar vadd.f16,
; vcvtb.f32.f16 and vadd.f32 for every element.
1679 define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) {
1680 ; CHECK-LABEL: half_half_acc:
1681 ; CHECK: @ %bb.0: @ %entry
1682 ; CHECK-NEXT: push {r4, r5, r7, lr}
1683 ; CHECK-NEXT: cbz r2, .LBB10_3
1684 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1685 ; CHECK-NEXT: subs r3, r2, #1
1686 ; CHECK-NEXT: and r4, r2, #3
1687 ; CHECK-NEXT: cmp r3, #3
1688 ; CHECK-NEXT: bhs .LBB10_4
1689 ; CHECK-NEXT: @ %bb.2:
1690 ; CHECK-NEXT: vldr s0, .LCPI10_0
1691 ; CHECK-NEXT: mov r5, r4
1692 ; CHECK-NEXT: mov.w r12, #0
1693 ; CHECK-NEXT: b .LBB10_6
1694 ; CHECK-NEXT: .LBB10_3:
1695 ; CHECK-NEXT: vldr s0, .LCPI10_0
1696 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1697 ; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
1698 ; CHECK-NEXT: subs r2, r2, r4
1699 ; CHECK-NEXT: movs r3, #1
1700 ; CHECK-NEXT: subs r2, #4
1701 ; CHECK-NEXT: vldr s0, .LCPI10_0
1702 ; CHECK-NEXT: mov r5, r4
1703 ; CHECK-NEXT: mov.w r12, #0
1704 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1705 ; CHECK-NEXT: sub.w r3, r0, #8
1706 ; CHECK-NEXT: sub.w r2, r1, #8
1707 ; CHECK-NEXT: dls lr, lr
1708 ; CHECK-NEXT: .LBB10_5: @ %for.body
1709 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1710 ; CHECK-NEXT: vldr.16 s2, [r2, #14]
1711 ; CHECK-NEXT: vldr.16 s4, [r3, #14]
1712 ; CHECK-NEXT: vldr.16 s6, [r3, #12]
1713 ; CHECK-NEXT: vldr.16 s8, [r3, #10]
1714 ; CHECK-NEXT: vadd.f16 s2, s4, s2
1715 ; CHECK-NEXT: vldr.16 s4, [r2, #12]
1716 ; CHECK-NEXT: vldr.16 s10, [r3, #8]
1717 ; CHECK-NEXT: adds r3, #8
1718 ; CHECK-NEXT: vadd.f16 s4, s6, s4
1719 ; CHECK-NEXT: vldr.16 s6, [r2, #10]
1720 ; CHECK-NEXT: add.w r12, r12, #4
1721 ; CHECK-NEXT: vadd.f16 s6, s8, s6
1722 ; CHECK-NEXT: vldr.16 s8, [r2, #8]
1723 ; CHECK-NEXT: adds r2, #8
1724 ; CHECK-NEXT: vadd.f16 s8, s10, s8
1725 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1726 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1727 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1728 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1729 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1730 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1731 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1732 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1733 ; CHECK-NEXT: le lr, .LBB10_5
1734 ; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1735 ; CHECK-NEXT: wls lr, r5, .LBB10_9
1736 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1737 ; CHECK-NEXT: mvn r2, #1
1738 ; CHECK-NEXT: mov lr, r5
1739 ; CHECK-NEXT: add.w r2, r2, r12, lsl #1
1740 ; CHECK-NEXT: add r0, r2
1741 ; CHECK-NEXT: add r1, r2
1742 ; CHECK-NEXT: .LBB10_8: @ %for.body.epil
1743 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1744 ; CHECK-NEXT: vldr.16 s2, [r1, #2]
1745 ; CHECK-NEXT: vldr.16 s4, [r0, #2]
1746 ; CHECK-NEXT: adds r0, #2
1747 ; CHECK-NEXT: adds r1, #2
1748 ; CHECK-NEXT: vadd.f16 s2, s4, s2
1749 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1750 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1751 ; CHECK-NEXT: le lr, .LBB10_8
1752 ; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
1753 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1754 ; CHECK-NEXT: .p2align 2
1755 ; CHECK-NEXT: @ %bb.10:
1756 ; CHECK-NEXT: .LCPI10_0:
1757 ; CHECK-NEXT: .long 0 @ float 0
; N == 0 branches straight to %for.cond.cleanup, whose phi yields 0.0 for the
; %entry edge, so no element is loaded.
1759 %cmp9 = icmp eq i32 %N, 0
1760 br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
1762 for.body.preheader: ; preds = %entry
; %xtraiter (N & 3) is the trip count of the epilogue loop.
; NOTE(review): %0 is defined on a line not visible in this excerpt; the
; expected output (`subs r3, r2, #1` / `cmp r3, #3`) suggests it is N - 1,
; i.e. the unrolled loop is skipped when N < 4 - confirm.
1764 %xtraiter = and i32 %N, 3
1765 %1 = icmp ult i32 %0, 3
1766 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1768 for.body.preheader.new: ; preds = %for.body.preheader
; Number of elements handled by the unrolled loop: N with the low two bits
; removed, so it is a multiple of 4.
1769 %unroll_iter = sub i32 %N, %xtraiter
; Join point after (or instead of) the unrolled loop. %i.011.unr and
; %res.010.unr carry the next index and the partial sum into the epilogue.
; %add2.lcssa.ph is only consumed on the edge to %for.cond.cleanup, i.e. when
; %xtraiter == 0; its undef input comes from the path that skips the unrolled
; loop (presumably never combined with %xtraiter == 0 - depends on %0, verify).
1772 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1773 %add2.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add2.3, %for.body ]
1774 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1775 %res.010.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add2.3, %for.body ]
1776 %lcmp.mod = icmp eq i32 %xtraiter, 0
1777 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Epilogue loop: one element per iteration, counting %epil.iter down from
; %xtraiter to zero.
1779 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1780 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1781 %res.010.epil = phi float [ %add2.epil, %for.body.epil ], [ %res.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1782 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1783 %arrayidx.epil = getelementptr inbounds half, half* %a, i32 %i.011.epil
1784 %2 = load half, half* %arrayidx.epil, align 2
1785 %arrayidx1.epil = getelementptr inbounds half, half* %b, i32 %i.011.epil
1786 %3 = load half, half* %arrayidx1.epil, align 2
; Half-precision add of the two inputs, then widen to float before accumulating.
1787 %add.epil = fadd half %2, %3
1788 %conv.epil = fpext half %add.epil to float
1789 %add2.epil = fadd float %res.010.epil, %conv.epil
1790 %inc.epil = add nuw i32 %i.011.epil, 1
1791 %epil.iter.sub = add i32 %epil.iter, -1
1792 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1793 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
; Single exit: selects 0.0 (N == 0), the unrolled-loop sum, or the epilogue sum.
1795 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1796 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add2.epil, %for.body.epil ]
1797 ret float %res.0.lcssa
; Unrolled main loop: four add-accumulate steps per iteration. %i.011 starts
; at 0 and advances by 4, so the `or` with 1, 2 and 3 below yields
; %i.011 + 1, + 2 and + 3. %niter counts down from %unroll_iter in steps of 4.
1799 for.body: ; preds = %for.body, %for.body.preheader.new
1800 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1801 %res.010 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add2.3, %for.body ]
1802 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1803 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.011
1804 %4 = load half, half* %arrayidx, align 2
1805 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.011
1806 %5 = load half, half* %arrayidx1, align 2
1807 %add = fadd half %4, %5
1808 %conv = fpext half %add to float
1809 %add2 = fadd float %res.010, %conv
1810 %inc = or i32 %i.011, 1
1811 %arrayidx.1 = getelementptr inbounds half, half* %a, i32 %inc
1812 %6 = load half, half* %arrayidx.1, align 2
1813 %arrayidx1.1 = getelementptr inbounds half, half* %b, i32 %inc
1814 %7 = load half, half* %arrayidx1.1, align 2
1815 %add.1 = fadd half %6, %7
1816 %conv.1 = fpext half %add.1 to float
1817 %add2.1 = fadd float %add2, %conv.1
1818 %inc.1 = or i32 %i.011, 2
1819 %arrayidx.2 = getelementptr inbounds half, half* %a, i32 %inc.1
1820 %8 = load half, half* %arrayidx.2, align 2
1821 %arrayidx1.2 = getelementptr inbounds half, half* %b, i32 %inc.1
1822 %9 = load half, half* %arrayidx1.2, align 2
1823 %add.2 = fadd half %8, %9
1824 %conv.2 = fpext half %add.2 to float
1825 %add2.2 = fadd float %add2.1, %conv.2
1826 %inc.2 = or i32 %i.011, 3
1827 %arrayidx.3 = getelementptr inbounds half, half* %a, i32 %inc.2
1828 %10 = load half, half* %arrayidx.3, align 2
1829 %arrayidx1.3 = getelementptr inbounds half, half* %b, i32 %inc.2
1830 %11 = load half, half* %arrayidx1.3, align 2
1831 %add.3 = fadd half %10, %11
1832 %conv.3 = fpext half %add.3 to float
1833 %add2.3 = fadd float %add2.2, %conv.3
1834 %inc.3 = add nuw i32 %i.011, 4
1835 %niter.nsub.3 = add i32 %niter, -4
1836 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1837 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; half_short_mac: multiply-accumulate of a half array with an i16 array. Each
; b[i] is converted from signed i16 to half (sitofp), multiplied with a[i] in
; half precision, and the product is extended to float and added to a float
; accumulator, which is the return value.
; The IR is already unrolled: %for.body processes four elements per iteration
; and %for.body.epil processes the remaining (N & 3) elements one at a time.
; The autogenerated assertions below expect a dls/le low-overhead loop for the
; unrolled body and a wls/le loop for the epilogue. The i16 operands are
; loaded with ldrsh, moved to an S register with vmov and converted with
; vcvt.f16.s32; the epilogue uses a pre-indexed ldrsh with writeback.
1840 define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) {
1841 ; CHECK-LABEL: half_short_mac:
1842 ; CHECK: @ %bb.0: @ %entry
1843 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
1844 ; CHECK-NEXT: cbz r2, .LBB11_3
1845 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1846 ; CHECK-NEXT: subs r3, r2, #1
1847 ; CHECK-NEXT: and r7, r2, #3
1848 ; CHECK-NEXT: cmp r3, #3
1849 ; CHECK-NEXT: bhs .LBB11_4
1850 ; CHECK-NEXT: @ %bb.2:
1851 ; CHECK-NEXT: vldr s0, .LCPI11_0
1852 ; CHECK-NEXT: mov r8, r7
1853 ; CHECK-NEXT: mov.w r12, #0
1854 ; CHECK-NEXT: b .LBB11_6
1855 ; CHECK-NEXT: .LBB11_3:
1856 ; CHECK-NEXT: vldr s0, .LCPI11_0
1857 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
1858 ; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
1859 ; CHECK-NEXT: subs r2, r2, r7
1860 ; CHECK-NEXT: movs r3, #1
1861 ; CHECK-NEXT: subs r2, #4
1862 ; CHECK-NEXT: vldr s0, .LCPI11_0
1863 ; CHECK-NEXT: mov r8, r7
1864 ; CHECK-NEXT: mov.w r12, #0
1865 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1866 ; CHECK-NEXT: movs r3, #0
1867 ; CHECK-NEXT: dls lr, lr
1868 ; CHECK-NEXT: .LBB11_5: @ %for.body
1869 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1870 ; CHECK-NEXT: adds r2, r1, r3
1871 ; CHECK-NEXT: adds r6, r0, r3
1872 ; CHECK-NEXT: vldr.16 s2, [r6, #6]
1873 ; CHECK-NEXT: add.w r12, r12, #4
1874 ; CHECK-NEXT: ldrsh.w r4, [r2, #2]
1875 ; CHECK-NEXT: ldrsh.w r5, [r2, #4]
1876 ; CHECK-NEXT: ldrsh.w r2, [r2, #6]
1877 ; CHECK-NEXT: vmov s8, r4
1878 ; CHECK-NEXT: vmov s6, r5
1879 ; CHECK-NEXT: vmov s4, r2
1880 ; CHECK-NEXT: vcvt.f16.s32 s4, s4
1881 ; CHECK-NEXT: vmul.f16 s2, s2, s4
1882 ; CHECK-NEXT: vldr.16 s4, [r6, #4]
1883 ; CHECK-NEXT: vcvt.f16.s32 s6, s6
1884 ; CHECK-NEXT: vmul.f16 s4, s4, s6
1885 ; CHECK-NEXT: vldr.16 s6, [r6, #2]
1886 ; CHECK-NEXT: vcvt.f16.s32 s8, s8
1887 ; CHECK-NEXT: ldrsh r2, [r1, r3]
1888 ; CHECK-NEXT: vmul.f16 s6, s6, s8
1889 ; CHECK-NEXT: vldr.16 s8, [r6]
1890 ; CHECK-NEXT: adds r3, #8
1891 ; CHECK-NEXT: vmov s10, r2
1892 ; CHECK-NEXT: vcvt.f16.s32 s10, s10
1893 ; CHECK-NEXT: vmul.f16 s8, s8, s10
1894 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1895 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1896 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1897 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1898 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1899 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1900 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1901 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1902 ; CHECK-NEXT: le lr, .LBB11_5
1903 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1904 ; CHECK-NEXT: wls lr, r8, .LBB11_9
1905 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1906 ; CHECK-NEXT: mvn r3, #1
1907 ; CHECK-NEXT: add.w r2, r3, r12, lsl #1
1908 ; CHECK-NEXT: add r0, r2
1909 ; CHECK-NEXT: add r1, r2
1910 ; CHECK-NEXT: mov lr, r8
1911 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil
1912 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1913 ; CHECK-NEXT: ldrsh r2, [r1, #2]!
1914 ; CHECK-NEXT: vldr.16 s2, [r0, #2]
1915 ; CHECK-NEXT: adds r0, #2
1916 ; CHECK-NEXT: vmov s4, r2
1917 ; CHECK-NEXT: vcvt.f16.s32 s4, s4
1918 ; CHECK-NEXT: vmul.f16 s2, s2, s4
1919 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1920 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1921 ; CHECK-NEXT: le lr, .LBB11_8
1922 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
1923 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
1924 ; CHECK-NEXT: .p2align 2
1925 ; CHECK-NEXT: @ %bb.10:
1926 ; CHECK-NEXT: .LCPI11_0:
1927 ; CHECK-NEXT: .long 0 @ float 0
; N == 0 branches straight to %for.cond.cleanup, whose phi yields 0.0 for the
; %entry edge, so no element is loaded.
1929 %cmp10 = icmp eq i32 %N, 0
1930 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
1932 for.body.preheader: ; preds = %entry
; %xtraiter (N & 3) is the trip count of the epilogue loop.
; NOTE(review): %0 is defined on a line not visible in this excerpt; the
; expected output (`subs r3, r2, #1` / `cmp r3, #3`) suggests it is N - 1,
; i.e. the unrolled loop is skipped when N < 4 - confirm.
1934 %xtraiter = and i32 %N, 3
1935 %1 = icmp ult i32 %0, 3
1936 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1938 for.body.preheader.new: ; preds = %for.body.preheader
; Number of elements handled by the unrolled loop: N with the low two bits
; removed, so it is a multiple of 4.
1939 %unroll_iter = sub i32 %N, %xtraiter
; Join point after (or instead of) the unrolled loop. %i.012.unr and
; %res.011.unr carry the next index and the partial sum into the epilogue.
; %add.lcssa.ph is only consumed on the edge to %for.cond.cleanup, i.e. when
; %xtraiter == 0; its undef input comes from the path that skips the unrolled
; loop (presumably never combined with %xtraiter == 0 - depends on %0, verify).
1942 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1943 %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
1944 %i.012.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1945 %res.011.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
1946 %lcmp.mod = icmp eq i32 %xtraiter, 0
1947 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Epilogue loop: one element per iteration, counting %epil.iter down from
; %xtraiter to zero.
1949 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1950 %i.012.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.012.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1951 %res.011.epil = phi float [ %add.epil, %for.body.epil ], [ %res.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1952 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1953 %arrayidx.epil = getelementptr inbounds half, half* %a, i32 %i.012.epil
1954 %2 = load half, half* %arrayidx.epil, align 2
1955 %arrayidx1.epil = getelementptr inbounds i16, i16* %b, i32 %i.012.epil
1956 %3 = load i16, i16* %arrayidx1.epil, align 2
; Signed i16 -> half conversion, half-precision multiply, then widen to float
; before accumulating.
1957 %conv2.epil = sitofp i16 %3 to half
1958 %mul.epil = fmul half %2, %conv2.epil
1959 %conv3.epil = fpext half %mul.epil to float
1960 %add.epil = fadd float %res.011.epil, %conv3.epil
1961 %inc.epil = add nuw i32 %i.012.epil, 1
1962 %epil.iter.sub = add i32 %epil.iter, -1
1963 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1964 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
; Single exit: selects 0.0 (N == 0), the unrolled-loop sum, or the epilogue sum.
1966 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1967 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
1968 ret float %res.0.lcssa
; Unrolled main loop: four convert-multiply-accumulate steps per iteration.
; %i.012 starts at 0 and advances by 4, so the `or` with 1, 2 and 3 below
; yields %i.012 + 1, + 2 and + 3. %niter counts down from %unroll_iter in
; steps of 4.
1970 for.body: ; preds = %for.body, %for.body.preheader.new
1971 %i.012 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1972 %res.011 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
1973 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1974 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.012
1975 %4 = load half, half* %arrayidx, align 2
1976 %arrayidx1 = getelementptr inbounds i16, i16* %b, i32 %i.012
1977 %5 = load i16, i16* %arrayidx1, align 2
1978 %conv2 = sitofp i16 %5 to half
1979 %mul = fmul half %4, %conv2
1980 %conv3 = fpext half %mul to float
1981 %add = fadd float %res.011, %conv3
1982 %inc = or i32 %i.012, 1
1983 %arrayidx.1 = getelementptr inbounds half, half* %a, i32 %inc
1984 %6 = load half, half* %arrayidx.1, align 2
1985 %arrayidx1.1 = getelementptr inbounds i16, i16* %b, i32 %inc
1986 %7 = load i16, i16* %arrayidx1.1, align 2
1987 %conv2.1 = sitofp i16 %7 to half
1988 %mul.1 = fmul half %6, %conv2.1
1989 %conv3.1 = fpext half %mul.1 to float
1990 %add.1 = fadd float %add, %conv3.1
1991 %inc.1 = or i32 %i.012, 2
1992 %arrayidx.2 = getelementptr inbounds half, half* %a, i32 %inc.1
1993 %8 = load half, half* %arrayidx.2, align 2
1994 %arrayidx1.2 = getelementptr inbounds i16, i16* %b, i32 %inc.1
1995 %9 = load i16, i16* %arrayidx1.2, align 2
1996 %conv2.2 = sitofp i16 %9 to half
1997 %mul.2 = fmul half %8, %conv2.2
1998 %conv3.2 = fpext half %mul.2 to float
1999 %add.2 = fadd float %add.1, %conv3.2
2000 %inc.2 = or i32 %i.012, 3
2001 %arrayidx.3 = getelementptr inbounds half, half* %a, i32 %inc.2
2002 %10 = load half, half* %arrayidx.3, align 2
2003 %arrayidx1.3 = getelementptr inbounds i16, i16* %b, i32 %inc.2
2004 %11 = load i16, i16* %arrayidx1.3, align 2
2005 %conv2.3 = sitofp i16 %11 to half
2006 %mul.3 = fmul half %10, %conv2.3
2007 %conv3.3 = fpext half %mul.3 to float
2008 %add.3 = fadd float %add.2, %conv3.3
2009 %inc.3 = add nuw i32 %i.012, 4
2010 %niter.nsub.3 = add i32 %niter, -4
2011 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
2012 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body