1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 %s -o - | FileCheck %s
; float_float_mul: element-wise float multiply, c[i] = a[i] * b[i], with the
; trip count given by N. The input IR is already vectorized and unrolled:
;   * a <4 x float> vector loop, guarded by a runtime pointer-overlap check;
;   * a scalar path made of a prologue loop (N & 3 iterations) followed by a
;     scalar loop unrolled by 4.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4 define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
5 ; CHECK-LABEL: float_float_mul:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: cmp r3, #0
10 ; CHECK-NEXT: .LBB0_1: @ %for.body.preheader
11 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
12 ; CHECK-NEXT: cmp r3, #3
13 ; CHECK-NEXT: bhi .LBB0_3
14 ; CHECK-NEXT: @ %bb.2:
15 ; CHECK-NEXT: mov.w r12, #0
16 ; CHECK-NEXT: b .LBB0_4
17 ; CHECK-NEXT: .LBB0_3: @ %vector.memcheck
18 ; CHECK-NEXT: add.w r7, r1, r3, lsl #2
19 ; CHECK-NEXT: add.w r6, r2, r3, lsl #2
20 ; CHECK-NEXT: cmp r7, r2
21 ; CHECK-NEXT: add.w r5, r0, r3, lsl #2
22 ; CHECK-NEXT: cset r7, hi
23 ; CHECK-NEXT: cmp r6, r1
24 ; CHECK-NEXT: csel r7, zr, r7, ls
25 ; CHECK-NEXT: cmp r6, r0
26 ; CHECK-NEXT: cset r6, hi
27 ; CHECK-NEXT: cmp r5, r2
28 ; CHECK-NEXT: cset r5, hi
29 ; CHECK-NEXT: mov.w r12, #0
30 ; CHECK-NEXT: tst r5, r6
32 ; CHECK-NEXT: cmpeq r7, #0
33 ; CHECK-NEXT: beq .LBB0_11
34 ; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22
35 ; CHECK-NEXT: mvn.w r7, r12
36 ; CHECK-NEXT: add.w r8, r7, r3
37 ; CHECK-NEXT: and r5, r3, #3
38 ; CHECK-NEXT: wls lr, r5, .LBB0_7
39 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
40 ; CHECK-NEXT: add.w r4, r12, r5
41 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2
42 ; CHECK-NEXT: add.w r6, r1, r12, lsl #2
43 ; CHECK-NEXT: add.w r7, r2, r12, lsl #2
44 ; CHECK-NEXT: mov r12, r4
45 ; CHECK-NEXT: .LBB0_6: @ %for.body.prol
46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
47 ; CHECK-NEXT: vldmia r6!, {s0}
48 ; CHECK-NEXT: vldmia r5!, {s2}
49 ; CHECK-NEXT: vmul.f32 s0, s2, s0
50 ; CHECK-NEXT: vstmia r7!, {s0}
51 ; CHECK-NEXT: le lr, .LBB0_6
52 ; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit
53 ; CHECK-NEXT: cmp.w r8, #3
54 ; CHECK-NEXT: blo .LBB0_10
55 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
56 ; CHECK-NEXT: sub.w r3, r3, r12
57 ; CHECK-NEXT: lsl.w r12, r12, #2
58 ; CHECK-NEXT: .LBB0_9: @ %for.body
59 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
60 ; CHECK-NEXT: add.w r7, r1, r12
61 ; CHECK-NEXT: add.w r6, r0, r12
62 ; CHECK-NEXT: add.w r5, r2, r12
63 ; CHECK-NEXT: adds r0, #16
64 ; CHECK-NEXT: vldr s0, [r7]
65 ; CHECK-NEXT: adds r1, #16
66 ; CHECK-NEXT: vldr s2, [r6]
67 ; CHECK-NEXT: adds r2, #16
68 ; CHECK-NEXT: subs r3, #4
69 ; CHECK-NEXT: vmul.f32 s0, s2, s0
70 ; CHECK-NEXT: vstr s0, [r5]
71 ; CHECK-NEXT: vldr s0, [r7, #4]
72 ; CHECK-NEXT: vldr s2, [r6, #4]
73 ; CHECK-NEXT: vmul.f32 s0, s2, s0
74 ; CHECK-NEXT: vstr s0, [r5, #4]
75 ; CHECK-NEXT: vldr s0, [r7, #8]
76 ; CHECK-NEXT: vldr s2, [r6, #8]
77 ; CHECK-NEXT: vmul.f32 s0, s2, s0
78 ; CHECK-NEXT: vstr s0, [r5, #8]
79 ; CHECK-NEXT: vldr s0, [r7, #12]
80 ; CHECK-NEXT: vldr s2, [r6, #12]
81 ; CHECK-NEXT: vmul.f32 s0, s2, s0
82 ; CHECK-NEXT: vstr s0, [r5, #12]
83 ; CHECK-NEXT: bne .LBB0_9
84 ; CHECK-NEXT: .LBB0_10:
85 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr}
87 ; CHECK-NEXT: .LBB0_11: @ %vector.ph
88 ; CHECK-NEXT: bic r12, r3, #3
89 ; CHECK-NEXT: movs r6, #1
90 ; CHECK-NEXT: sub.w r7, r12, #4
91 ; CHECK-NEXT: mov r4, r0
92 ; CHECK-NEXT: mov r5, r1
93 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
94 ; CHECK-NEXT: mov r6, r2
95 ; CHECK-NEXT: .LBB0_12: @ %vector.body
96 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
97 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16
98 ; CHECK-NEXT: vldrw.u32 q1, [r4], #16
99 ; CHECK-NEXT: vmul.f32 q0, q1, q0
100 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
101 ; CHECK-NEXT: le lr, .LBB0_12
102 ; CHECK-NEXT: @ %bb.13: @ %middle.block
103 ; CHECK-NEXT: cmp r12, r3
104 ; CHECK-NEXT: bne .LBB0_4
105 ; CHECK-NEXT: b .LBB0_10
; Entry: nothing to do when N == 0.
107 %cmp8 = icmp eq i32 %N, 0
108 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 4 elements: bypass the vector path and use the scalar code only.
110 for.body.preheader: ; preds = %entry
111 %min.iters.check = icmp ult i32 %N, 4
112 br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
; Scalar path entry. %i.09.ph is the first index still to be processed:
; 0 when the vector loop was skipped, %n.vec when it ran.
114 for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
115 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
116 %0 = xor i32 %i.09.ph, -1
118 %xtraiter = and i32 %N, 3
119 %lcmp.mod = icmp eq i32 %xtraiter, 0
120 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
; Scalar prologue: one element per iteration, repeated %xtraiter (N & 3) times.
122 for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
123 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
124 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
125 %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
126 %2 = load float, ptr %arrayidx.prol, align 4
127 %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
128 %3 = load float, ptr %arrayidx1.prol, align 4
129 %mul.prol = fmul float %2, %3
130 %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
131 store float %mul.prol, ptr %arrayidx2.prol, align 4
132 %inc.prol = add nuw i32 %i.09.prol, 1
133 %prol.iter.sub = add i32 %prol.iter, -1
134 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
135 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
; Decide whether the unrolled scalar loop is needed at all.
; NOTE(review): the definition of %1 is not visible in this excerpt; presumably
; it is derived from %0 and %N - confirm against the full file.
137 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
138 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
139 %4 = icmp ult i32 %1, 3
140 br i1 %4, label %for.cond.cleanup, label %for.body
; Runtime overlap check: the vector loop is entered only when the N-element
; range at %c overlaps neither the range at %a nor the range at %b.
142 vector.memcheck: ; preds = %for.body.preheader
143 %scevgep = getelementptr float, ptr %c, i32 %N
144 %scevgep13 = getelementptr float, ptr %a, i32 %N
145 %scevgep16 = getelementptr float, ptr %b, i32 %N
146 %bound0 = icmp ugt ptr %scevgep13, %c
147 %bound1 = icmp ugt ptr %scevgep, %a
148 %found.conflict = and i1 %bound0, %bound1
149 %bound018 = icmp ugt ptr %scevgep16, %c
150 %bound119 = icmp ugt ptr %scevgep, %b
151 %found.conflict20 = and i1 %bound018, %bound119
152 %conflict.rdx = or i1 %found.conflict, %found.conflict20
153 br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
; %n.vec = N with the two low bits cleared, i.e. N rounded down to a multiple of 4.
155 vector.ph: ; preds = %vector.memcheck
156 %n.vec = and i32 %N, -4
157 br label %vector.body
; Vector loop: four floats per iteration until %index reaches %n.vec.
159 vector.body: ; preds = %vector.body, %vector.ph
160 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
161 %5 = getelementptr inbounds float, ptr %a, i32 %index
162 %wide.load = load <4 x float>, ptr %5, align 4
163 %6 = getelementptr inbounds float, ptr %b, i32 %index
164 %wide.load21 = load <4 x float>, ptr %6, align 4
165 %7 = fmul <4 x float> %wide.load, %wide.load21
166 %8 = getelementptr inbounds float, ptr %c, i32 %index
167 store <4 x float> %7, ptr %8, align 4
168 %index.next = add i32 %index, 4
169 %9 = icmp eq i32 %index.next, %n.vec
170 br i1 %9, label %middle.block, label %vector.body
; Done if the vector loop covered all N elements; otherwise the scalar path
; handles the remaining ones starting at %n.vec.
172 middle.block: ; preds = %vector.body
173 %cmp.n = icmp eq i32 %n.vec, %N
174 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
176 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
; Scalar loop unrolled by 4: handles indices i, i+1, i+2 and i+3 per iteration
; and exits when i+4 equals N.
179 for.body: ; preds = %for.body.prol.loopexit, %for.body
180 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
181 %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
182 %10 = load float, ptr %arrayidx, align 4
183 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
184 %11 = load float, ptr %arrayidx1, align 4
185 %mul = fmul float %10, %11
186 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
187 store float %mul, ptr %arrayidx2, align 4
188 %inc = add nuw i32 %i.09, 1
189 %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
190 %12 = load float, ptr %arrayidx.1, align 4
191 %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
192 %13 = load float, ptr %arrayidx1.1, align 4
193 %mul.1 = fmul float %12, %13
194 %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
195 store float %mul.1, ptr %arrayidx2.1, align 4
196 %inc.1 = add nuw i32 %i.09, 2
197 %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
198 %14 = load float, ptr %arrayidx.2, align 4
199 %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
200 %15 = load float, ptr %arrayidx1.2, align 4
201 %mul.2 = fmul float %14, %15
202 %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
203 store float %mul.2, ptr %arrayidx2.2, align 4
204 %inc.2 = add nuw i32 %i.09, 3
205 %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
206 %16 = load float, ptr %arrayidx.3, align 4
207 %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
208 %17 = load float, ptr %arrayidx1.3, align 4
209 %mul.3 = fmul float %16, %17
210 %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
211 store float %mul.3, ptr %arrayidx2.3, align 4
212 %inc.3 = add nuw i32 %i.09, 4
213 %exitcond.3 = icmp eq i32 %inc.3, %N
214 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
; float_float_add: element-wise float addition, c[i] = a[i] + b[i], with the
; trip count given by N. The input IR is already vectorized and unrolled:
;   * a <4 x float> vector loop, guarded by a runtime pointer-overlap check;
;   * a scalar path made of a prologue loop (N & 3 iterations) followed by a
;     scalar loop unrolled by 4.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
217 define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
218 ; CHECK-LABEL: float_float_add:
219 ; CHECK: @ %bb.0: @ %entry
220 ; CHECK-NEXT: cmp r3, #0
222 ; CHECK-NEXT: bxeq lr
223 ; CHECK-NEXT: .LBB1_1: @ %for.body.preheader
224 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
225 ; CHECK-NEXT: cmp r3, #3
226 ; CHECK-NEXT: bhi .LBB1_3
227 ; CHECK-NEXT: @ %bb.2:
228 ; CHECK-NEXT: mov.w r12, #0
229 ; CHECK-NEXT: b .LBB1_4
230 ; CHECK-NEXT: .LBB1_3: @ %vector.memcheck
231 ; CHECK-NEXT: add.w r7, r1, r3, lsl #2
232 ; CHECK-NEXT: add.w r6, r2, r3, lsl #2
233 ; CHECK-NEXT: cmp r7, r2
234 ; CHECK-NEXT: add.w r5, r0, r3, lsl #2
235 ; CHECK-NEXT: cset r7, hi
236 ; CHECK-NEXT: cmp r6, r1
237 ; CHECK-NEXT: csel r7, zr, r7, ls
238 ; CHECK-NEXT: cmp r6, r0
239 ; CHECK-NEXT: cset r6, hi
240 ; CHECK-NEXT: cmp r5, r2
241 ; CHECK-NEXT: cset r5, hi
242 ; CHECK-NEXT: mov.w r12, #0
243 ; CHECK-NEXT: tst r5, r6
245 ; CHECK-NEXT: cmpeq r7, #0
246 ; CHECK-NEXT: beq .LBB1_11
247 ; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22
248 ; CHECK-NEXT: mvn.w r7, r12
249 ; CHECK-NEXT: add.w r8, r7, r3
250 ; CHECK-NEXT: and r5, r3, #3
251 ; CHECK-NEXT: wls lr, r5, .LBB1_7
252 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
253 ; CHECK-NEXT: add.w r4, r12, r5
254 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2
255 ; CHECK-NEXT: add.w r6, r1, r12, lsl #2
256 ; CHECK-NEXT: add.w r7, r2, r12, lsl #2
257 ; CHECK-NEXT: mov r12, r4
258 ; CHECK-NEXT: .LBB1_6: @ %for.body.prol
259 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
260 ; CHECK-NEXT: vldmia r6!, {s0}
261 ; CHECK-NEXT: vldmia r5!, {s2}
262 ; CHECK-NEXT: vadd.f32 s0, s2, s0
263 ; CHECK-NEXT: vstmia r7!, {s0}
264 ; CHECK-NEXT: le lr, .LBB1_6
265 ; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit
266 ; CHECK-NEXT: cmp.w r8, #3
267 ; CHECK-NEXT: blo .LBB1_10
268 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
269 ; CHECK-NEXT: sub.w r3, r3, r12
270 ; CHECK-NEXT: lsl.w r12, r12, #2
271 ; CHECK-NEXT: .LBB1_9: @ %for.body
272 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
273 ; CHECK-NEXT: add.w r7, r1, r12
274 ; CHECK-NEXT: add.w r6, r0, r12
275 ; CHECK-NEXT: add.w r5, r2, r12
276 ; CHECK-NEXT: adds r0, #16
277 ; CHECK-NEXT: vldr s0, [r7]
278 ; CHECK-NEXT: adds r1, #16
279 ; CHECK-NEXT: vldr s2, [r6]
280 ; CHECK-NEXT: adds r2, #16
281 ; CHECK-NEXT: subs r3, #4
282 ; CHECK-NEXT: vadd.f32 s0, s2, s0
283 ; CHECK-NEXT: vstr s0, [r5]
284 ; CHECK-NEXT: vldr s0, [r7, #4]
285 ; CHECK-NEXT: vldr s2, [r6, #4]
286 ; CHECK-NEXT: vadd.f32 s0, s2, s0
287 ; CHECK-NEXT: vstr s0, [r5, #4]
288 ; CHECK-NEXT: vldr s0, [r7, #8]
289 ; CHECK-NEXT: vldr s2, [r6, #8]
290 ; CHECK-NEXT: vadd.f32 s0, s2, s0
291 ; CHECK-NEXT: vstr s0, [r5, #8]
292 ; CHECK-NEXT: vldr s0, [r7, #12]
293 ; CHECK-NEXT: vldr s2, [r6, #12]
294 ; CHECK-NEXT: vadd.f32 s0, s2, s0
295 ; CHECK-NEXT: vstr s0, [r5, #12]
296 ; CHECK-NEXT: bne .LBB1_9
297 ; CHECK-NEXT: .LBB1_10:
298 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr}
300 ; CHECK-NEXT: .LBB1_11: @ %vector.ph
301 ; CHECK-NEXT: bic r12, r3, #3
302 ; CHECK-NEXT: movs r6, #1
303 ; CHECK-NEXT: sub.w r7, r12, #4
304 ; CHECK-NEXT: mov r4, r0
305 ; CHECK-NEXT: mov r5, r1
306 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
307 ; CHECK-NEXT: mov r6, r2
308 ; CHECK-NEXT: .LBB1_12: @ %vector.body
309 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
310 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16
311 ; CHECK-NEXT: vldrw.u32 q1, [r4], #16
312 ; CHECK-NEXT: vadd.f32 q0, q1, q0
313 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
314 ; CHECK-NEXT: le lr, .LBB1_12
315 ; CHECK-NEXT: @ %bb.13: @ %middle.block
316 ; CHECK-NEXT: cmp r12, r3
317 ; CHECK-NEXT: bne .LBB1_4
318 ; CHECK-NEXT: b .LBB1_10
; Entry: nothing to do when N == 0.
320 %cmp8 = icmp eq i32 %N, 0
321 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 4 elements: bypass the vector path and use the scalar code only.
323 for.body.preheader: ; preds = %entry
324 %min.iters.check = icmp ult i32 %N, 4
325 br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
; Scalar path entry. %i.09.ph is the first index still to be processed:
; 0 when the vector loop was skipped, %n.vec when it ran.
327 for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
328 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
329 %0 = xor i32 %i.09.ph, -1
331 %xtraiter = and i32 %N, 3
332 %lcmp.mod = icmp eq i32 %xtraiter, 0
333 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
; Scalar prologue: one element per iteration, repeated %xtraiter (N & 3) times.
335 for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
336 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
337 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
338 %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
339 %2 = load float, ptr %arrayidx.prol, align 4
340 %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
341 %3 = load float, ptr %arrayidx1.prol, align 4
342 %add.prol = fadd float %2, %3
343 %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
344 store float %add.prol, ptr %arrayidx2.prol, align 4
345 %inc.prol = add nuw i32 %i.09.prol, 1
346 %prol.iter.sub = add i32 %prol.iter, -1
347 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
348 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
; Decide whether the unrolled scalar loop is needed at all.
; NOTE(review): the definition of %1 is not visible in this excerpt; presumably
; it is derived from %0 and %N - confirm against the full file.
350 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
351 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
352 %4 = icmp ult i32 %1, 3
353 br i1 %4, label %for.cond.cleanup, label %for.body
; Runtime overlap check: the vector loop is entered only when the N-element
; range at %c overlaps neither the range at %a nor the range at %b.
355 vector.memcheck: ; preds = %for.body.preheader
356 %scevgep = getelementptr float, ptr %c, i32 %N
357 %scevgep13 = getelementptr float, ptr %a, i32 %N
358 %scevgep16 = getelementptr float, ptr %b, i32 %N
359 %bound0 = icmp ugt ptr %scevgep13, %c
360 %bound1 = icmp ugt ptr %scevgep, %a
361 %found.conflict = and i1 %bound0, %bound1
362 %bound018 = icmp ugt ptr %scevgep16, %c
363 %bound119 = icmp ugt ptr %scevgep, %b
364 %found.conflict20 = and i1 %bound018, %bound119
365 %conflict.rdx = or i1 %found.conflict, %found.conflict20
366 br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
; %n.vec = N with the two low bits cleared, i.e. N rounded down to a multiple of 4.
368 vector.ph: ; preds = %vector.memcheck
369 %n.vec = and i32 %N, -4
370 br label %vector.body
; Vector loop: four floats per iteration until %index reaches %n.vec.
372 vector.body: ; preds = %vector.body, %vector.ph
373 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
374 %5 = getelementptr inbounds float, ptr %a, i32 %index
375 %wide.load = load <4 x float>, ptr %5, align 4
376 %6 = getelementptr inbounds float, ptr %b, i32 %index
377 %wide.load21 = load <4 x float>, ptr %6, align 4
378 %7 = fadd <4 x float> %wide.load, %wide.load21
379 %8 = getelementptr inbounds float, ptr %c, i32 %index
380 store <4 x float> %7, ptr %8, align 4
381 %index.next = add i32 %index, 4
382 %9 = icmp eq i32 %index.next, %n.vec
383 br i1 %9, label %middle.block, label %vector.body
; Done if the vector loop covered all N elements; otherwise the scalar path
; handles the remaining ones starting at %n.vec.
385 middle.block: ; preds = %vector.body
386 %cmp.n = icmp eq i32 %n.vec, %N
387 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
389 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
; Scalar loop unrolled by 4: handles indices i, i+1, i+2 and i+3 per iteration
; and exits when i+4 equals N.
392 for.body: ; preds = %for.body.prol.loopexit, %for.body
393 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
394 %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
395 %10 = load float, ptr %arrayidx, align 4
396 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
397 %11 = load float, ptr %arrayidx1, align 4
398 %add = fadd float %10, %11
399 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
400 store float %add, ptr %arrayidx2, align 4
401 %inc = add nuw i32 %i.09, 1
402 %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
403 %12 = load float, ptr %arrayidx.1, align 4
404 %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
405 %13 = load float, ptr %arrayidx1.1, align 4
406 %add.1 = fadd float %12, %13
407 %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
408 store float %add.1, ptr %arrayidx2.1, align 4
409 %inc.1 = add nuw i32 %i.09, 2
410 %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
411 %14 = load float, ptr %arrayidx.2, align 4
412 %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
413 %15 = load float, ptr %arrayidx1.2, align 4
414 %add.2 = fadd float %14, %15
415 %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
416 store float %add.2, ptr %arrayidx2.2, align 4
417 %inc.2 = add nuw i32 %i.09, 3
418 %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
419 %16 = load float, ptr %arrayidx.3, align 4
420 %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
421 %17 = load float, ptr %arrayidx1.3, align 4
422 %add.3 = fadd float %16, %17
423 %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
424 store float %add.3, ptr %arrayidx2.3, align 4
425 %inc.3 = add nuw i32 %i.09, 4
426 %exitcond.3 = icmp eq i32 %inc.3, %N
427 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
; float_float_sub: element-wise float subtraction, c[i] = a[i] - b[i], with the
; trip count given by N. The input IR is already vectorized and unrolled:
;   * a <4 x float> vector loop, guarded by a runtime pointer-overlap check;
;   * a scalar path made of a prologue loop (N & 3 iterations) followed by a
;     scalar loop unrolled by 4.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
430 define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
431 ; CHECK-LABEL: float_float_sub:
432 ; CHECK: @ %bb.0: @ %entry
433 ; CHECK-NEXT: cmp r3, #0
435 ; CHECK-NEXT: bxeq lr
436 ; CHECK-NEXT: .LBB2_1: @ %for.body.preheader
437 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
438 ; CHECK-NEXT: cmp r3, #3
439 ; CHECK-NEXT: bhi .LBB2_3
440 ; CHECK-NEXT: @ %bb.2:
441 ; CHECK-NEXT: mov.w r12, #0
442 ; CHECK-NEXT: b .LBB2_4
443 ; CHECK-NEXT: .LBB2_3: @ %vector.memcheck
444 ; CHECK-NEXT: add.w r7, r1, r3, lsl #2
445 ; CHECK-NEXT: add.w r6, r2, r3, lsl #2
446 ; CHECK-NEXT: cmp r7, r2
447 ; CHECK-NEXT: add.w r5, r0, r3, lsl #2
448 ; CHECK-NEXT: cset r7, hi
449 ; CHECK-NEXT: cmp r6, r1
450 ; CHECK-NEXT: csel r7, zr, r7, ls
451 ; CHECK-NEXT: cmp r6, r0
452 ; CHECK-NEXT: cset r6, hi
453 ; CHECK-NEXT: cmp r5, r2
454 ; CHECK-NEXT: cset r5, hi
455 ; CHECK-NEXT: mov.w r12, #0
456 ; CHECK-NEXT: tst r5, r6
458 ; CHECK-NEXT: cmpeq r7, #0
459 ; CHECK-NEXT: beq .LBB2_11
460 ; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22
461 ; CHECK-NEXT: mvn.w r7, r12
462 ; CHECK-NEXT: add.w r8, r7, r3
463 ; CHECK-NEXT: and r5, r3, #3
464 ; CHECK-NEXT: wls lr, r5, .LBB2_7
465 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
466 ; CHECK-NEXT: add.w r4, r12, r5
467 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2
468 ; CHECK-NEXT: add.w r6, r1, r12, lsl #2
469 ; CHECK-NEXT: add.w r7, r2, r12, lsl #2
470 ; CHECK-NEXT: mov r12, r4
471 ; CHECK-NEXT: .LBB2_6: @ %for.body.prol
472 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
473 ; CHECK-NEXT: vldmia r6!, {s0}
474 ; CHECK-NEXT: vldmia r5!, {s2}
475 ; CHECK-NEXT: vsub.f32 s0, s2, s0
476 ; CHECK-NEXT: vstmia r7!, {s0}
477 ; CHECK-NEXT: le lr, .LBB2_6
478 ; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit
479 ; CHECK-NEXT: cmp.w r8, #3
480 ; CHECK-NEXT: blo .LBB2_10
481 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
482 ; CHECK-NEXT: sub.w r3, r3, r12
483 ; CHECK-NEXT: lsl.w r12, r12, #2
484 ; CHECK-NEXT: .LBB2_9: @ %for.body
485 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
486 ; CHECK-NEXT: add.w r7, r1, r12
487 ; CHECK-NEXT: add.w r6, r0, r12
488 ; CHECK-NEXT: add.w r5, r2, r12
489 ; CHECK-NEXT: adds r0, #16
490 ; CHECK-NEXT: vldr s0, [r7]
491 ; CHECK-NEXT: adds r1, #16
492 ; CHECK-NEXT: vldr s2, [r6]
493 ; CHECK-NEXT: adds r2, #16
494 ; CHECK-NEXT: subs r3, #4
495 ; CHECK-NEXT: vsub.f32 s0, s2, s0
496 ; CHECK-NEXT: vstr s0, [r5]
497 ; CHECK-NEXT: vldr s0, [r7, #4]
498 ; CHECK-NEXT: vldr s2, [r6, #4]
499 ; CHECK-NEXT: vsub.f32 s0, s2, s0
500 ; CHECK-NEXT: vstr s0, [r5, #4]
501 ; CHECK-NEXT: vldr s0, [r7, #8]
502 ; CHECK-NEXT: vldr s2, [r6, #8]
503 ; CHECK-NEXT: vsub.f32 s0, s2, s0
504 ; CHECK-NEXT: vstr s0, [r5, #8]
505 ; CHECK-NEXT: vldr s0, [r7, #12]
506 ; CHECK-NEXT: vldr s2, [r6, #12]
507 ; CHECK-NEXT: vsub.f32 s0, s2, s0
508 ; CHECK-NEXT: vstr s0, [r5, #12]
509 ; CHECK-NEXT: bne .LBB2_9
510 ; CHECK-NEXT: .LBB2_10:
511 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr}
513 ; CHECK-NEXT: .LBB2_11: @ %vector.ph
514 ; CHECK-NEXT: bic r12, r3, #3
515 ; CHECK-NEXT: movs r6, #1
516 ; CHECK-NEXT: sub.w r7, r12, #4
517 ; CHECK-NEXT: mov r4, r0
518 ; CHECK-NEXT: mov r5, r1
519 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
520 ; CHECK-NEXT: mov r6, r2
521 ; CHECK-NEXT: .LBB2_12: @ %vector.body
522 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
523 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16
524 ; CHECK-NEXT: vldrw.u32 q1, [r4], #16
525 ; CHECK-NEXT: vsub.f32 q0, q1, q0
526 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
527 ; CHECK-NEXT: le lr, .LBB2_12
528 ; CHECK-NEXT: @ %bb.13: @ %middle.block
529 ; CHECK-NEXT: cmp r12, r3
530 ; CHECK-NEXT: bne .LBB2_4
531 ; CHECK-NEXT: b .LBB2_10
; Entry: nothing to do when N == 0.
533 %cmp8 = icmp eq i32 %N, 0
534 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 4 elements: bypass the vector path and use the scalar code only.
536 for.body.preheader: ; preds = %entry
537 %min.iters.check = icmp ult i32 %N, 4
538 br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
; Scalar path entry. %i.09.ph is the first index still to be processed:
; 0 when the vector loop was skipped, %n.vec when it ran.
540 for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
541 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
542 %0 = xor i32 %i.09.ph, -1
544 %xtraiter = and i32 %N, 3
545 %lcmp.mod = icmp eq i32 %xtraiter, 0
546 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
; Scalar prologue: one element per iteration, repeated %xtraiter (N & 3) times.
548 for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
549 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
550 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
551 %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
552 %2 = load float, ptr %arrayidx.prol, align 4
553 %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
554 %3 = load float, ptr %arrayidx1.prol, align 4
555 %sub.prol = fsub float %2, %3
556 %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
557 store float %sub.prol, ptr %arrayidx2.prol, align 4
558 %inc.prol = add nuw i32 %i.09.prol, 1
559 %prol.iter.sub = add i32 %prol.iter, -1
560 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
561 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
; Decide whether the unrolled scalar loop is needed at all.
; NOTE(review): the definition of %1 is not visible in this excerpt; presumably
; it is derived from %0 and %N - confirm against the full file.
563 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
564 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
565 %4 = icmp ult i32 %1, 3
566 br i1 %4, label %for.cond.cleanup, label %for.body
; Runtime overlap check: the vector loop is entered only when the N-element
; range at %c overlaps neither the range at %a nor the range at %b.
568 vector.memcheck: ; preds = %for.body.preheader
569 %scevgep = getelementptr float, ptr %c, i32 %N
570 %scevgep13 = getelementptr float, ptr %a, i32 %N
571 %scevgep16 = getelementptr float, ptr %b, i32 %N
572 %bound0 = icmp ugt ptr %scevgep13, %c
573 %bound1 = icmp ugt ptr %scevgep, %a
574 %found.conflict = and i1 %bound0, %bound1
575 %bound018 = icmp ugt ptr %scevgep16, %c
576 %bound119 = icmp ugt ptr %scevgep, %b
577 %found.conflict20 = and i1 %bound018, %bound119
578 %conflict.rdx = or i1 %found.conflict, %found.conflict20
579 br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
; %n.vec = N with the two low bits cleared, i.e. N rounded down to a multiple of 4.
581 vector.ph: ; preds = %vector.memcheck
582 %n.vec = and i32 %N, -4
583 br label %vector.body
; Vector loop: four floats per iteration until %index reaches %n.vec.
585 vector.body: ; preds = %vector.body, %vector.ph
586 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
587 %5 = getelementptr inbounds float, ptr %a, i32 %index
588 %wide.load = load <4 x float>, ptr %5, align 4
589 %6 = getelementptr inbounds float, ptr %b, i32 %index
590 %wide.load21 = load <4 x float>, ptr %6, align 4
591 %7 = fsub <4 x float> %wide.load, %wide.load21
592 %8 = getelementptr inbounds float, ptr %c, i32 %index
593 store <4 x float> %7, ptr %8, align 4
594 %index.next = add i32 %index, 4
595 %9 = icmp eq i32 %index.next, %n.vec
596 br i1 %9, label %middle.block, label %vector.body
; Done if the vector loop covered all N elements; otherwise the scalar path
; handles the remaining ones starting at %n.vec.
598 middle.block: ; preds = %vector.body
599 %cmp.n = icmp eq i32 %n.vec, %N
600 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
602 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
; Scalar loop unrolled by 4: handles indices i, i+1, i+2 and i+3 per iteration
; and exits when i+4 equals N.
605 for.body: ; preds = %for.body.prol.loopexit, %for.body
606 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
607 %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
608 %10 = load float, ptr %arrayidx, align 4
609 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
610 %11 = load float, ptr %arrayidx1, align 4
611 %sub = fsub float %10, %11
612 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
613 store float %sub, ptr %arrayidx2, align 4
614 %inc = add nuw i32 %i.09, 1
615 %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
616 %12 = load float, ptr %arrayidx.1, align 4
617 %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
618 %13 = load float, ptr %arrayidx1.1, align 4
619 %sub.1 = fsub float %12, %13
620 %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
621 store float %sub.1, ptr %arrayidx2.1, align 4
622 %inc.1 = add nuw i32 %i.09, 2
623 %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
624 %14 = load float, ptr %arrayidx.2, align 4
625 %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
626 %15 = load float, ptr %arrayidx1.2, align 4
627 %sub.2 = fsub float %14, %15
628 %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
629 store float %sub.2, ptr %arrayidx2.2, align 4
630 %inc.2 = add nuw i32 %i.09, 3
631 %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
632 %16 = load float, ptr %arrayidx.3, align 4
633 %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
634 %17 = load float, ptr %arrayidx1.3, align 4
635 %sub.3 = fsub float %16, %17
636 %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
637 store float %sub.3, ptr %arrayidx2.3, align 4
638 %inc.3 = add nuw i32 %i.09, 4
639 %exitcond.3 = icmp eq i32 %inc.3, %N
640 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
643 define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
644 ; CHECK-LABEL: float_int_mul:
645 ; CHECK: @ %bb.0: @ %entry
646 ; CHECK-NEXT: cmp r3, #0
648 ; CHECK-NEXT: bxeq lr
649 ; CHECK-NEXT: .LBB3_1: @ %for.body.preheader
650 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
651 ; CHECK-NEXT: cmp r3, #3
652 ; CHECK-NEXT: bls .LBB3_6
653 ; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
654 ; CHECK-NEXT: add.w r7, r0, r3, lsl #2
655 ; CHECK-NEXT: cmp r7, r2
657 ; CHECK-NEXT: addhi.w r7, r2, r3, lsl #2
658 ; CHECK-NEXT: cmphi r7, r0
659 ; CHECK-NEXT: bhi .LBB3_6
660 ; CHECK-NEXT: @ %bb.3: @ %vector.ph
661 ; CHECK-NEXT: bic r12, r3, #3
662 ; CHECK-NEXT: movs r6, #1
663 ; CHECK-NEXT: sub.w r7, r12, #4
664 ; CHECK-NEXT: mov r4, r0
665 ; CHECK-NEXT: mov r5, r1
666 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
667 ; CHECK-NEXT: mov r6, r2
668 ; CHECK-NEXT: .LBB3_4: @ %vector.body
669 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
670 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16
671 ; CHECK-NEXT: vldrw.u32 q1, [r4], #16
672 ; CHECK-NEXT: vcvt.f32.s32 q0, q0
673 ; CHECK-NEXT: vmul.f32 q0, q1, q0
674 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
675 ; CHECK-NEXT: le lr, .LBB3_4
676 ; CHECK-NEXT: @ %bb.5: @ %middle.block
677 ; CHECK-NEXT: cmp r12, r3
678 ; CHECK-NEXT: bne .LBB3_7
679 ; CHECK-NEXT: b .LBB3_13
680 ; CHECK-NEXT: .LBB3_6:
681 ; CHECK-NEXT: mov.w r12, #0
682 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16
683 ; CHECK-NEXT: mvn.w r7, r12
684 ; CHECK-NEXT: add.w r8, r7, r3
685 ; CHECK-NEXT: and r5, r3, #3
686 ; CHECK-NEXT: wls lr, r5, .LBB3_10
687 ; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader
688 ; CHECK-NEXT: add.w r4, r12, r5
689 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2
690 ; CHECK-NEXT: add.w r6, r1, r12, lsl #2
691 ; CHECK-NEXT: add.w r7, r2, r12, lsl #2
692 ; CHECK-NEXT: mov r12, r4
693 ; CHECK-NEXT: .LBB3_9: @ %for.body.prol
694 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
695 ; CHECK-NEXT: ldr r4, [r6], #4
696 ; CHECK-NEXT: vldmia r5!, {s2}
697 ; CHECK-NEXT: vmov s0, r4
698 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
699 ; CHECK-NEXT: vmul.f32 s0, s2, s0
700 ; CHECK-NEXT: vstmia r7!, {s0}
701 ; CHECK-NEXT: le lr, .LBB3_9
702 ; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit
703 ; CHECK-NEXT: cmp.w r8, #3
704 ; CHECK-NEXT: blo .LBB3_13
705 ; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1
706 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2
707 ; CHECK-NEXT: sub.w r3, r3, r12
708 ; CHECK-NEXT: adds r1, #8
709 ; CHECK-NEXT: lsl.w r12, r12, #2
710 ; CHECK-NEXT: .LBB3_12: @ %for.body
711 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
712 ; CHECK-NEXT: vldr s0, [r1, #-8]
713 ; CHECK-NEXT: add.w r7, r0, r12
714 ; CHECK-NEXT: add.w r6, r2, r12
715 ; CHECK-NEXT: adds r0, #16
716 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
717 ; CHECK-NEXT: vldr s2, [r7]
718 ; CHECK-NEXT: adds r2, #16
719 ; CHECK-NEXT: subs r3, #4
720 ; CHECK-NEXT: vmul.f32 s0, s2, s0
721 ; CHECK-NEXT: vstr s0, [r6]
722 ; CHECK-NEXT: vldr s0, [r1, #-4]
723 ; CHECK-NEXT: vldr s2, [r7, #4]
724 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
725 ; CHECK-NEXT: vmul.f32 s0, s2, s0
726 ; CHECK-NEXT: vstr s0, [r6, #4]
727 ; CHECK-NEXT: vldr s0, [r1]
728 ; CHECK-NEXT: vldr s2, [r7, #8]
729 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
730 ; CHECK-NEXT: vmul.f32 s0, s2, s0
731 ; CHECK-NEXT: vstr s0, [r6, #8]
732 ; CHECK-NEXT: vldr s0, [r1, #4]
733 ; CHECK-NEXT: add.w r1, r1, #16
734 ; CHECK-NEXT: vldr s2, [r7, #12]
735 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
736 ; CHECK-NEXT: vmul.f32 s0, s2, s0
737 ; CHECK-NEXT: vstr s0, [r6, #12]
738 ; CHECK-NEXT: bne .LBB3_12
739 ; CHECK-NEXT: .LBB3_13:
740 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr}
743 %cmp8 = icmp eq i32 %N, 0
744 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
746 for.body.preheader: ; preds = %entry
747 %min.iters.check = icmp ult i32 %N, 4
748 br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck
750 for.body.preheader16: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
751 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
752 %0 = xor i32 %i.09.ph, -1
754 %xtraiter = and i32 %N, 3
755 %lcmp.mod = icmp eq i32 %xtraiter, 0
756 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
758 for.body.prol: ; preds = %for.body.preheader16, %for.body.prol
759 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader16 ]
760 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader16 ]
761 %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
762 %2 = load float, ptr %arrayidx.prol, align 4
763 %arrayidx1.prol = getelementptr inbounds i32, ptr %b, i32 %i.09.prol
764 %3 = load i32, ptr %arrayidx1.prol, align 4
765 %conv.prol = sitofp i32 %3 to float
766 %mul.prol = fmul float %2, %conv.prol
767 %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
768 store float %mul.prol, ptr %arrayidx2.prol, align 4
769 %inc.prol = add nuw i32 %i.09.prol, 1
770 %prol.iter.sub = add i32 %prol.iter, -1
771 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
772 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
774 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader16
775 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
776 %4 = icmp ult i32 %1, 3
777 br i1 %4, label %for.cond.cleanup, label %for.body
779 vector.memcheck: ; preds = %for.body.preheader
780 %scevgep = getelementptr float, ptr %c, i32 %N
781 %scevgep13 = getelementptr float, ptr %a, i32 %N
782 %bound0 = icmp ugt ptr %scevgep13, %c
783 %bound1 = icmp ugt ptr %scevgep, %a
784 %found.conflict = and i1 %bound0, %bound1
785 br i1 %found.conflict, label %for.body.preheader16, label %vector.ph
787 vector.ph: ; preds = %vector.memcheck
788 %n.vec = and i32 %N, -4
789 br label %vector.body
791 vector.body: ; preds = %vector.body, %vector.ph
792 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
793 %5 = getelementptr inbounds float, ptr %a, i32 %index
794 %wide.load = load <4 x float>, ptr %5, align 4
795 %6 = getelementptr inbounds i32, ptr %b, i32 %index
796 %wide.load15 = load <4 x i32>, ptr %6, align 4
797 %7 = sitofp <4 x i32> %wide.load15 to <4 x float>
798 %8 = fmul <4 x float> %wide.load, %7
799 %9 = getelementptr inbounds float, ptr %c, i32 %index
800 store <4 x float> %8, ptr %9, align 4
801 %index.next = add i32 %index, 4
802 %10 = icmp eq i32 %index.next, %n.vec
803 br i1 %10, label %middle.block, label %vector.body
805 middle.block: ; preds = %vector.body
806 %cmp.n = icmp eq i32 %n.vec, %N
807 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16
809 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
812 for.body: ; preds = %for.body.prol.loopexit, %for.body
813 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
814 %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
815 %11 = load float, ptr %arrayidx, align 4
816 %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09
817 %12 = load i32, ptr %arrayidx1, align 4
818 %conv = sitofp i32 %12 to float
819 %mul = fmul float %11, %conv
820 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
821 store float %mul, ptr %arrayidx2, align 4
822 %inc = add nuw i32 %i.09, 1
823 %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
824 %13 = load float, ptr %arrayidx.1, align 4
825 %arrayidx1.1 = getelementptr inbounds i32, ptr %b, i32 %inc
826 %14 = load i32, ptr %arrayidx1.1, align 4
827 %conv.1 = sitofp i32 %14 to float
828 %mul.1 = fmul float %13, %conv.1
829 %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
830 store float %mul.1, ptr %arrayidx2.1, align 4
831 %inc.1 = add nuw i32 %i.09, 2
832 %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
833 %15 = load float, ptr %arrayidx.2, align 4
834 %arrayidx1.2 = getelementptr inbounds i32, ptr %b, i32 %inc.1
835 %16 = load i32, ptr %arrayidx1.2, align 4
836 %conv.2 = sitofp i32 %16 to float
837 %mul.2 = fmul float %15, %conv.2
838 %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
839 store float %mul.2, ptr %arrayidx2.2, align 4
840 %inc.2 = add nuw i32 %i.09, 3
841 %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
842 %17 = load float, ptr %arrayidx.3, align 4
843 %arrayidx1.3 = getelementptr inbounds i32, ptr %b, i32 %inc.2
844 %18 = load i32, ptr %arrayidx1.3, align 4
845 %conv.3 = sitofp i32 %18 to float
846 %mul.3 = fmul float %17, %conv.3
847 %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
848 store float %mul.3, ptr %arrayidx2.3, align 4
849 %inc.3 = add nuw i32 %i.09, 4
850 %exitcond.3 = icmp eq i32 %inc.3, %N
851 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
; float_int_int_mul: c[i] = (float)(a[i] * b[i]) for i in [0, N), where %a and
; %b are read as i32 elements and %c is written as float elements. The integer
; product is formed first (mul nsw) and only then converted with sitofp.
; The IR carries a <4 x i32> vector body for the first (N & -4) elements and a
; scalar loop for whatever remains. The autogenerated assertions expect the
; vector body to lower to vmul.i32 followed by vcvt.f32.s32, and both loops to
; be closed by an `le lr` loop-end instruction.
854 define arm_aapcs_vfpcc void @float_int_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
855 ; CHECK-LABEL: float_int_int_mul:
856 ; CHECK: @ %bb.0: @ %entry
857 ; CHECK-NEXT: push {r4, r5, r6, lr}
858 ; CHECK-NEXT: cbz r3, .LBB4_8
859 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
860 ; CHECK-NEXT: cmp r3, #3
861 ; CHECK-NEXT: bhi .LBB4_3
862 ; CHECK-NEXT: @ %bb.2:
863 ; CHECK-NEXT: mov.w r12, #0
864 ; CHECK-NEXT: b .LBB4_6
865 ; CHECK-NEXT: .LBB4_3: @ %vector.ph
866 ; CHECK-NEXT: bic r12, r3, #3
867 ; CHECK-NEXT: movs r5, #1
868 ; CHECK-NEXT: sub.w r6, r12, #4
869 ; CHECK-NEXT: mov r4, r0
870 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
871 ; CHECK-NEXT: mov r5, r1
872 ; CHECK-NEXT: mov r6, r2
873 ; CHECK-NEXT: .LBB4_4: @ %vector.body
874 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
875 ; CHECK-NEXT: vldrw.u32 q0, [r4], #16
876 ; CHECK-NEXT: vldrw.u32 q1, [r5], #16
877 ; CHECK-NEXT: vmul.i32 q0, q1, q0
878 ; CHECK-NEXT: vcvt.f32.s32 q0, q0
879 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
880 ; CHECK-NEXT: le lr, .LBB4_4
881 ; CHECK-NEXT: @ %bb.5: @ %middle.block
882 ; CHECK-NEXT: cmp r12, r3
884 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
885 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11
886 ; CHECK-NEXT: sub.w lr, r3, r12
887 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2
888 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2
889 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
890 ; CHECK-NEXT: .LBB4_7: @ %for.body
891 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
892 ; CHECK-NEXT: ldr r3, [r0], #4
893 ; CHECK-NEXT: ldr r6, [r1], #4
894 ; CHECK-NEXT: muls r3, r6, r3
895 ; CHECK-NEXT: vmov s0, r3
896 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
897 ; CHECK-NEXT: vstmia r2!, {s0}
898 ; CHECK-NEXT: le lr, .LBB4_7
899 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
900 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Nothing to do for an empty input.
902 %cmp8 = icmp eq i32 %N, 0
903 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
905 for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector body and use the scalar loop only.
906 %min.iters.check = icmp ult i32 %N, 4
907 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
909 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
; Scalar start index: 0 when the vector body was bypassed, otherwise n.vec.
910 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
913 vector.ph: ; preds = %for.body.preheader
; n.vec = N with the low two bits cleared, i.e. the vector loop's end index.
914 %n.vec = and i32 %N, -4
915 br label %vector.body
917 vector.body: ; preds = %vector.body, %vector.ph
918 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
919 %0 = getelementptr inbounds i32, ptr %a, i32 %index
920 %wide.load = load <4 x i32>, ptr %0, align 4
921 %1 = getelementptr inbounds i32, ptr %b, i32 %index
922 %wide.load10 = load <4 x i32>, ptr %1, align 4
; Integer multiply first, then convert the product to float.
923 %2 = mul nsw <4 x i32> %wide.load10, %wide.load
924 %3 = sitofp <4 x i32> %2 to <4 x float>
925 %4 = getelementptr inbounds float, ptr %c, i32 %index
926 store <4 x float> %3, ptr %4, align 4
927 %index.next = add i32 %index, 4
928 %5 = icmp eq i32 %index.next, %n.vec
929 br i1 %5, label %middle.block, label %vector.body
931 middle.block: ; preds = %vector.body
; Skip the scalar loop when the vector body already covered all N elements.
932 %cmp.n = icmp eq i32 %n.vec, %N
933 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
935 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
938 for.body: ; preds = %for.body.preheader11, %for.body
; Scalar loop: one element per iteration, same multiply-then-convert order.
939 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
940 %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.09
941 %6 = load i32, ptr %arrayidx, align 4
942 %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09
943 %7 = load i32, ptr %arrayidx1, align 4
944 %mul = mul nsw i32 %7, %6
945 %conv = sitofp i32 %mul to float
946 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
947 store float %conv, ptr %arrayidx2, align 4
948 %inc = add nuw i32 %i.09, 1
949 %exitcond = icmp eq i32 %inc, %N
950 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_mul: c[i] = (float)(a[i] * b[i]) for i in [0, N), where %a and %b
; are read as half elements and %c is written as float elements. The multiply
; is performed in half precision (fmul half) and the result is widened with
; fpext afterwards.
; The IR carries a <4 x half> vector body for the first (N & -4) elements and
; a scalar loop for whatever remains. The autogenerated assertions expect each
; <4 x half> operand to be fetched with two 32-bit ldr and inserted into lanes
; 0 and 1 of a q register, followed by vmul.f16 and four
; vcvtt/vcvtb.f32.f16 conversions before the 16-byte store.
953 define arm_aapcs_vfpcc void @half_half_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
954 ; CHECK-LABEL: half_half_mul:
955 ; CHECK: @ %bb.0: @ %entry
956 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
957 ; CHECK-NEXT: cmp r3, #0
958 ; CHECK-NEXT: beq .LBB5_8
959 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
960 ; CHECK-NEXT: cmp r3, #3
961 ; CHECK-NEXT: bhi .LBB5_3
962 ; CHECK-NEXT: @ %bb.2:
963 ; CHECK-NEXT: mov.w r12, #0
964 ; CHECK-NEXT: b .LBB5_6
965 ; CHECK-NEXT: .LBB5_3: @ %vector.ph
966 ; CHECK-NEXT: bic r12, r3, #3
967 ; CHECK-NEXT: movs r5, #1
968 ; CHECK-NEXT: sub.w r6, r12, #4
969 ; CHECK-NEXT: mov r4, r0
970 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
971 ; CHECK-NEXT: mov r5, r1
972 ; CHECK-NEXT: mov r6, r2
973 ; CHECK-NEXT: .LBB5_4: @ %vector.body
974 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
975 ; CHECK-NEXT: ldr.w r9, [r4]
976 ; CHECK-NEXT: ldr r7, [r5]
977 ; CHECK-NEXT: ldr.w r8, [r4, #4]
978 ; CHECK-NEXT: vmov.32 q0[0], r9
979 ; CHECK-NEXT: ldr.w r10, [r5, #4]
980 ; CHECK-NEXT: vmov.32 q1[0], r7
981 ; CHECK-NEXT: vmov.32 q0[1], r8
982 ; CHECK-NEXT: adds r4, #8
983 ; CHECK-NEXT: vmov.32 q1[1], r10
984 ; CHECK-NEXT: adds r5, #8
985 ; CHECK-NEXT: vmul.f16 q0, q0, q1
986 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
987 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
988 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
989 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
990 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
991 ; CHECK-NEXT: le lr, .LBB5_4
992 ; CHECK-NEXT: @ %bb.5: @ %middle.block
993 ; CHECK-NEXT: cmp r12, r3
994 ; CHECK-NEXT: beq .LBB5_8
995 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11
996 ; CHECK-NEXT: sub.w lr, r3, r12
997 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
998 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
999 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
1000 ; CHECK-NEXT: .LBB5_7: @ %for.body
1001 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1002 ; CHECK-NEXT: vldr.16 s0, [r1]
1003 ; CHECK-NEXT: vldr.16 s2, [r0]
1004 ; CHECK-NEXT: adds r0, #2
1005 ; CHECK-NEXT: adds r1, #2
1006 ; CHECK-NEXT: vmul.f16 s0, s2, s0
1007 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1008 ; CHECK-NEXT: vstmia r2!, {s0}
1009 ; CHECK-NEXT: le lr, .LBB5_7
1010 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
1011 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; Nothing to do for an empty input.
1013 %cmp8 = icmp eq i32 %N, 0
1014 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1016 for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector body and use the scalar loop only.
1017 %min.iters.check = icmp ult i32 %N, 4
1018 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
1020 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
; Scalar start index: 0 when the vector body was bypassed, otherwise n.vec.
1021 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1024 vector.ph: ; preds = %for.body.preheader
; n.vec = N with the low two bits cleared, i.e. the vector loop's end index.
1025 %n.vec = and i32 %N, -4
1026 br label %vector.body
1028 vector.body: ; preds = %vector.body, %vector.ph
1029 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1030 %0 = getelementptr inbounds half, ptr %a, i32 %index
1031 %wide.load = load <4 x half>, ptr %0, align 2
1032 %1 = getelementptr inbounds half, ptr %b, i32 %index
1033 %wide.load10 = load <4 x half>, ptr %1, align 2
; Multiply in half precision, then widen the four products to float.
1034 %2 = fmul <4 x half> %wide.load, %wide.load10
1035 %3 = fpext <4 x half> %2 to <4 x float>
1036 %4 = getelementptr inbounds float, ptr %c, i32 %index
1037 store <4 x float> %3, ptr %4, align 4
1038 %index.next = add i32 %index, 4
1039 %5 = icmp eq i32 %index.next, %n.vec
1040 br i1 %5, label %middle.block, label %vector.body
1042 middle.block: ; preds = %vector.body
; Skip the scalar loop when the vector body already covered all N elements.
1043 %cmp.n = icmp eq i32 %n.vec, %N
1044 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1046 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1049 for.body: ; preds = %for.body.preheader11, %for.body
; Scalar loop: one element per iteration, same multiply-then-widen order.
1050 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1051 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
1052 %6 = load half, ptr %arrayidx, align 2
1053 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
1054 %7 = load half, ptr %arrayidx1, align 2
1055 %mul = fmul half %6, %7
1056 %conv = fpext half %mul to float
1057 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
1058 store float %conv, ptr %arrayidx2, align 4
1059 %inc = add nuw i32 %i.09, 1
1060 %exitcond = icmp eq i32 %inc, %N
1061 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_add: c[i] = (float)(a[i] + b[i]) for i in [0, N), where %a and %b
; are read as half elements and %c is written as float elements. The addition
; is performed in half precision (fadd half) and the result is widened with
; fpext afterwards.
; The IR carries a <4 x half> vector body for the first (N & -4) elements and
; a scalar loop for whatever remains. The autogenerated assertions expect
; vadd.f16 on q registers in the vector body (operands assembled from 32-bit
; ldr + vmov.32 lane inserts) and vadd.f16 on s registers in the scalar loop,
; each followed by vcvtt/vcvtb.f32.f16 widening.
1064 define arm_aapcs_vfpcc void @half_half_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
1065 ; CHECK-LABEL: half_half_add:
1066 ; CHECK: @ %bb.0: @ %entry
1067 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1068 ; CHECK-NEXT: cmp r3, #0
1069 ; CHECK-NEXT: beq .LBB6_8
1070 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1071 ; CHECK-NEXT: cmp r3, #3
1072 ; CHECK-NEXT: bhi .LBB6_3
1073 ; CHECK-NEXT: @ %bb.2:
1074 ; CHECK-NEXT: mov.w r12, #0
1075 ; CHECK-NEXT: b .LBB6_6
1076 ; CHECK-NEXT: .LBB6_3: @ %vector.ph
1077 ; CHECK-NEXT: bic r12, r3, #3
1078 ; CHECK-NEXT: movs r5, #1
1079 ; CHECK-NEXT: sub.w r6, r12, #4
1080 ; CHECK-NEXT: mov r4, r0
1081 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1082 ; CHECK-NEXT: mov r5, r1
1083 ; CHECK-NEXT: mov r6, r2
1084 ; CHECK-NEXT: .LBB6_4: @ %vector.body
1085 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1086 ; CHECK-NEXT: ldr.w r9, [r4]
1087 ; CHECK-NEXT: ldr r7, [r5]
1088 ; CHECK-NEXT: ldr.w r8, [r4, #4]
1089 ; CHECK-NEXT: vmov.32 q0[0], r9
1090 ; CHECK-NEXT: ldr.w r10, [r5, #4]
1091 ; CHECK-NEXT: vmov.32 q1[0], r7
1092 ; CHECK-NEXT: vmov.32 q0[1], r8
1093 ; CHECK-NEXT: adds r4, #8
1094 ; CHECK-NEXT: vmov.32 q1[1], r10
1095 ; CHECK-NEXT: adds r5, #8
1096 ; CHECK-NEXT: vadd.f16 q0, q0, q1
1097 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
1098 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
1099 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
1100 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1101 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
1102 ; CHECK-NEXT: le lr, .LBB6_4
1103 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1104 ; CHECK-NEXT: cmp r12, r3
1105 ; CHECK-NEXT: beq .LBB6_8
1106 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11
1107 ; CHECK-NEXT: sub.w lr, r3, r12
1108 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1109 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1110 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
1111 ; CHECK-NEXT: .LBB6_7: @ %for.body
1112 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1113 ; CHECK-NEXT: vldr.16 s0, [r1]
1114 ; CHECK-NEXT: vldr.16 s2, [r0]
1115 ; CHECK-NEXT: adds r0, #2
1116 ; CHECK-NEXT: adds r1, #2
1117 ; CHECK-NEXT: vadd.f16 s0, s2, s0
1118 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1119 ; CHECK-NEXT: vstmia r2!, {s0}
1120 ; CHECK-NEXT: le lr, .LBB6_7
1121 ; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
1122 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; Nothing to do for an empty input.
1124 %cmp8 = icmp eq i32 %N, 0
1125 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1127 for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector body and use the scalar loop only.
1128 %min.iters.check = icmp ult i32 %N, 4
1129 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
1131 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
; Scalar start index: 0 when the vector body was bypassed, otherwise n.vec.
1132 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1135 vector.ph: ; preds = %for.body.preheader
; n.vec = N with the low two bits cleared, i.e. the vector loop's end index.
1136 %n.vec = and i32 %N, -4
1137 br label %vector.body
1139 vector.body: ; preds = %vector.body, %vector.ph
1140 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1141 %0 = getelementptr inbounds half, ptr %a, i32 %index
1142 %wide.load = load <4 x half>, ptr %0, align 2
1143 %1 = getelementptr inbounds half, ptr %b, i32 %index
1144 %wide.load10 = load <4 x half>, ptr %1, align 2
; Add in half precision, then widen the four sums to float.
1145 %2 = fadd <4 x half> %wide.load, %wide.load10
1146 %3 = fpext <4 x half> %2 to <4 x float>
1147 %4 = getelementptr inbounds float, ptr %c, i32 %index
1148 store <4 x float> %3, ptr %4, align 4
1149 %index.next = add i32 %index, 4
1150 %5 = icmp eq i32 %index.next, %n.vec
1151 br i1 %5, label %middle.block, label %vector.body
1153 middle.block: ; preds = %vector.body
; Skip the scalar loop when the vector body already covered all N elements.
1154 %cmp.n = icmp eq i32 %n.vec, %N
1155 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1157 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1160 for.body: ; preds = %for.body.preheader11, %for.body
; Scalar loop: one element per iteration, same add-then-widen order.
1161 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1162 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
1163 %6 = load half, ptr %arrayidx, align 2
1164 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
1165 %7 = load half, ptr %arrayidx1, align 2
1166 %add = fadd half %6, %7
1167 %conv = fpext half %add to float
1168 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
1169 store float %conv, ptr %arrayidx2, align 4
1170 %inc = add nuw i32 %i.09, 1
1171 %exitcond = icmp eq i32 %inc, %N
1172 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_sub: c[i] = (float)(a[i] - b[i]) for i in [0, N), where %a and %b
; are read as half elements and %c is written as float elements. The
; subtraction is performed in half precision (fsub half, %a element minus %b
; element) and the result is widened with fpext afterwards.
; The IR carries a <4 x half> vector body for the first (N & -4) elements and
; a scalar loop for whatever remains. The autogenerated assertions expect
; vsub.f16 on q registers in the vector body (operands assembled from 32-bit
; ldr + vmov.32 lane inserts) and vsub.f16 on s registers in the scalar loop,
; each followed by vcvtt/vcvtb.f32.f16 widening.
1175 define arm_aapcs_vfpcc void @half_half_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
1176 ; CHECK-LABEL: half_half_sub:
1177 ; CHECK: @ %bb.0: @ %entry
1178 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1179 ; CHECK-NEXT: cmp r3, #0
1180 ; CHECK-NEXT: beq .LBB7_8
1181 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1182 ; CHECK-NEXT: cmp r3, #3
1183 ; CHECK-NEXT: bhi .LBB7_3
1184 ; CHECK-NEXT: @ %bb.2:
1185 ; CHECK-NEXT: mov.w r12, #0
1186 ; CHECK-NEXT: b .LBB7_6
1187 ; CHECK-NEXT: .LBB7_3: @ %vector.ph
1188 ; CHECK-NEXT: bic r12, r3, #3
1189 ; CHECK-NEXT: movs r5, #1
1190 ; CHECK-NEXT: sub.w r6, r12, #4
1191 ; CHECK-NEXT: mov r4, r0
1192 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1193 ; CHECK-NEXT: mov r5, r1
1194 ; CHECK-NEXT: mov r6, r2
1195 ; CHECK-NEXT: .LBB7_4: @ %vector.body
1196 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1197 ; CHECK-NEXT: ldr.w r9, [r4]
1198 ; CHECK-NEXT: ldr r7, [r5]
1199 ; CHECK-NEXT: ldr.w r8, [r4, #4]
1200 ; CHECK-NEXT: vmov.32 q0[0], r9
1201 ; CHECK-NEXT: ldr.w r10, [r5, #4]
1202 ; CHECK-NEXT: vmov.32 q1[0], r7
1203 ; CHECK-NEXT: vmov.32 q0[1], r8
1204 ; CHECK-NEXT: adds r4, #8
1205 ; CHECK-NEXT: vmov.32 q1[1], r10
1206 ; CHECK-NEXT: adds r5, #8
1207 ; CHECK-NEXT: vsub.f16 q0, q0, q1
1208 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
1209 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
1210 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
1211 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1212 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
1213 ; CHECK-NEXT: le lr, .LBB7_4
1214 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1215 ; CHECK-NEXT: cmp r12, r3
1216 ; CHECK-NEXT: beq .LBB7_8
1217 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11
1218 ; CHECK-NEXT: sub.w lr, r3, r12
1219 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1220 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1221 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
1222 ; CHECK-NEXT: .LBB7_7: @ %for.body
1223 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1224 ; CHECK-NEXT: vldr.16 s0, [r1]
1225 ; CHECK-NEXT: vldr.16 s2, [r0]
1226 ; CHECK-NEXT: adds r0, #2
1227 ; CHECK-NEXT: adds r1, #2
1228 ; CHECK-NEXT: vsub.f16 s0, s2, s0
1229 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1230 ; CHECK-NEXT: vstmia r2!, {s0}
1231 ; CHECK-NEXT: le lr, .LBB7_7
1232 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
1233 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; Nothing to do for an empty input.
1235 %cmp8 = icmp eq i32 %N, 0
1236 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1238 for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector body and use the scalar loop only.
1239 %min.iters.check = icmp ult i32 %N, 4
1240 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
1242 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
; Scalar start index: 0 when the vector body was bypassed, otherwise n.vec.
1243 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1246 vector.ph: ; preds = %for.body.preheader
; n.vec = N with the low two bits cleared, i.e. the vector loop's end index.
1247 %n.vec = and i32 %N, -4
1248 br label %vector.body
1250 vector.body: ; preds = %vector.body, %vector.ph
1251 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1252 %0 = getelementptr inbounds half, ptr %a, i32 %index
1253 %wide.load = load <4 x half>, ptr %0, align 2
1254 %1 = getelementptr inbounds half, ptr %b, i32 %index
1255 %wide.load10 = load <4 x half>, ptr %1, align 2
; Subtract in half precision, then widen the four differences to float.
1256 %2 = fsub <4 x half> %wide.load, %wide.load10
1257 %3 = fpext <4 x half> %2 to <4 x float>
1258 %4 = getelementptr inbounds float, ptr %c, i32 %index
1259 store <4 x float> %3, ptr %4, align 4
1260 %index.next = add i32 %index, 4
1261 %5 = icmp eq i32 %index.next, %n.vec
1262 br i1 %5, label %middle.block, label %vector.body
1264 middle.block: ; preds = %vector.body
; Skip the scalar loop when the vector body already covered all N elements.
1265 %cmp.n = icmp eq i32 %n.vec, %N
1266 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1268 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1271 for.body: ; preds = %for.body.preheader11, %for.body
; Scalar loop: one element per iteration, same subtract-then-widen order.
1272 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1273 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
1274 %6 = load half, ptr %arrayidx, align 2
1275 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
1276 %7 = load half, ptr %arrayidx1, align 2
1277 %sub = fsub half %6, %7
1278 %conv = fpext half %sub to float
1279 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
1280 store float %conv, ptr %arrayidx2, align 4
1281 %inc = add nuw i32 %i.09, 1
1282 %exitcond = icmp eq i32 %inc, %N
1283 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_short_mul: c[i] = (float)(a[i] * (half)b[i]) for i in [0, N), where %a
; is read as half elements, %b as i16 elements and %c is written as float
; elements. Each i16 is converted to half with sitofp, the multiply is done in
; half precision, and the product is widened with fpext.
; The IR carries a 4-wide vector body for the first (N & -4) elements and a
; scalar loop for whatever remains. The autogenerated assertions expect the
; vector body to load the i16 lanes widened (vldrh.u32), write them back
; narrowed to a 16-byte stack slot (vstrh.32 via r1 = sp) and reload them
; (vldrw.u32) before vcvt.f16.s16 and vmul.f16; the scalar loop is expected to
; use ldrsh + vcvt.f16.s32 instead.
1286 define arm_aapcs_vfpcc void @half_short_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
1287 ; CHECK-LABEL: half_short_mul:
1288 ; CHECK: @ %bb.0: @ %entry
1289 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
1290 ; CHECK-NEXT: sub sp, #16
1291 ; CHECK-NEXT: cmp r3, #0
1292 ; CHECK-NEXT: beq .LBB8_8
1293 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1294 ; CHECK-NEXT: mov r8, r2
1295 ; CHECK-NEXT: mov r9, r1
1296 ; CHECK-NEXT: cmp r3, #3
1297 ; CHECK-NEXT: bhi .LBB8_3
1298 ; CHECK-NEXT: @ %bb.2:
1299 ; CHECK-NEXT: mov.w r12, #0
1300 ; CHECK-NEXT: b .LBB8_6
1301 ; CHECK-NEXT: .LBB8_3: @ %vector.ph
1302 ; CHECK-NEXT: bic r12, r3, #3
1303 ; CHECK-NEXT: movs r6, #1
1304 ; CHECK-NEXT: sub.w r7, r12, #4
1305 ; CHECK-NEXT: mov r1, sp
1306 ; CHECK-NEXT: mov r5, r0
1307 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
1308 ; CHECK-NEXT: mov r6, r9
1309 ; CHECK-NEXT: mov r7, r8
1310 ; CHECK-NEXT: .LBB8_4: @ %vector.body
1311 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1312 ; CHECK-NEXT: vldrh.u32 q0, [r6], #8
1313 ; CHECK-NEXT: ldr r4, [r5]
1314 ; CHECK-NEXT: ldr r2, [r5, #4]
1315 ; CHECK-NEXT: adds r5, #8
1316 ; CHECK-NEXT: vstrh.32 q0, [r1]
1317 ; CHECK-NEXT: vmov.32 q1[0], r4
1318 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1319 ; CHECK-NEXT: vmov.32 q1[1], r2
1320 ; CHECK-NEXT: vcvt.f16.s16 q0, q0
1321 ; CHECK-NEXT: vmul.f16 q0, q1, q0
1322 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
1323 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
1324 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
1325 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1326 ; CHECK-NEXT: vstrb.8 q0, [r7], #16
1327 ; CHECK-NEXT: le lr, .LBB8_4
1328 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1329 ; CHECK-NEXT: cmp r12, r3
1330 ; CHECK-NEXT: beq .LBB8_8
1331 ; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13
1332 ; CHECK-NEXT: sub.w lr, r3, r12
1333 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1334 ; CHECK-NEXT: add.w r1, r9, r12, lsl #1
1335 ; CHECK-NEXT: add.w r2, r8, r12, lsl #2
1336 ; CHECK-NEXT: .LBB8_7: @ %for.body
1337 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1338 ; CHECK-NEXT: ldrsh r3, [r1], #2
1339 ; CHECK-NEXT: vldr.16 s0, [r0]
1340 ; CHECK-NEXT: adds r0, #2
1341 ; CHECK-NEXT: vmov s2, r3
1342 ; CHECK-NEXT: vcvt.f16.s32 s2, s2
1343 ; CHECK-NEXT: vmul.f16 s0, s0, s2
1344 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1345 ; CHECK-NEXT: vstmia r2!, {s0}
1346 ; CHECK-NEXT: le lr, .LBB8_7
1347 ; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup
1348 ; CHECK-NEXT: add sp, #16
1349 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
; Nothing to do for an empty input.
1351 %cmp10 = icmp eq i32 %N, 0
1352 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
1354 for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector body and use the scalar loop only.
1355 %min.iters.check = icmp ult i32 %N, 4
1356 br i1 %min.iters.check, label %for.body.preheader13, label %vector.ph
1358 for.body.preheader13: ; preds = %middle.block, %for.body.preheader
; Scalar start index: 0 when the vector body was bypassed, otherwise n.vec.
1359 %i.011.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1362 vector.ph: ; preds = %for.body.preheader
; n.vec = N with the low two bits cleared, i.e. the vector loop's end index.
1363 %n.vec = and i32 %N, -4
1364 br label %vector.body
1366 vector.body: ; preds = %vector.body, %vector.ph
1367 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1368 %0 = getelementptr inbounds half, ptr %a, i32 %index
1369 %wide.load = load <4 x half>, ptr %0, align 2
1370 %1 = getelementptr inbounds i16, ptr %b, i32 %index
1371 %wide.load12 = load <4 x i16>, ptr %1, align 2
; Convert the i16 lanes to half, multiply in half precision, then widen.
1372 %2 = sitofp <4 x i16> %wide.load12 to <4 x half>
1373 %3 = fmul <4 x half> %wide.load, %2
1374 %4 = fpext <4 x half> %3 to <4 x float>
1375 %5 = getelementptr inbounds float, ptr %c, i32 %index
1376 store <4 x float> %4, ptr %5, align 4
1377 %index.next = add i32 %index, 4
1378 %6 = icmp eq i32 %index.next, %n.vec
1379 br i1 %6, label %middle.block, label %vector.body
1381 middle.block: ; preds = %vector.body
; Skip the scalar loop when the vector body already covered all N elements.
1382 %cmp.n = icmp eq i32 %n.vec, %N
1383 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
1385 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1388 for.body: ; preds = %for.body.preheader13, %for.body
; Scalar loop: one element per iteration, same convert/multiply/widen order.
1389 %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader13 ]
1390 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011
1391 %7 = load half, ptr %arrayidx, align 2
1392 %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.011
1393 %8 = load i16, ptr %arrayidx1, align 2
1394 %conv2 = sitofp i16 %8 to half
1395 %mul = fmul half %7, %conv2
1396 %conv3 = fpext half %mul to float
1397 %arrayidx4 = getelementptr inbounds float, ptr %c, i32 %i.011
1398 store float %conv3, ptr %arrayidx4, align 4
1399 %inc = add nuw i32 %i.011, 1
1400 %exitcond = icmp eq i32 %inc, %N
1401 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_mac: returns the float sum over i in [0, N) of (float)(a[i] * b[i]),
; where %a and %b are read as half elements. Each product is computed in half
; precision (fmul half), widened with fpext, and accumulated with a float fadd.
; The result is 0.0 when N == 0.
; The IR here has no vector body: the main loop is unrolled by four (four
; load/fmul/fpext/fadd groups per iteration, index advanced by 4) and
; `for.body.epil` runs the remaining (N & 3) iterations one element at a time.
; The autogenerated assertions expect purely scalar code (vldr.16, vmul.f16,
; vcvtb.f32.f16, vadd.f32) with the accumulator seeded from a constant-pool
; zero.
1404 define arm_aapcs_vfpcc float @half_half_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
1405 ; CHECK-LABEL: half_half_mac:
1406 ; CHECK: @ %bb.0: @ %entry
1407 ; CHECK-NEXT: push {r4, r5, r7, lr}
1408 ; CHECK-NEXT: cbz r2, .LBB9_3
1409 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1410 ; CHECK-NEXT: subs r3, r2, #1
1411 ; CHECK-NEXT: and r12, r2, #3
1412 ; CHECK-NEXT: cmp r3, #3
1413 ; CHECK-NEXT: bhs .LBB9_4
1414 ; CHECK-NEXT: @ %bb.2:
1415 ; CHECK-NEXT: vldr s0, .LCPI9_0
1416 ; CHECK-NEXT: movs r2, #0
1417 ; CHECK-NEXT: b .LBB9_6
1418 ; CHECK-NEXT: .LBB9_3:
1419 ; CHECK-NEXT: vldr s0, .LCPI9_0
1420 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1421 ; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
1422 ; CHECK-NEXT: bic r2, r2, #3
1423 ; CHECK-NEXT: movs r3, #1
1424 ; CHECK-NEXT: subs r2, #4
1425 ; CHECK-NEXT: vldr s0, .LCPI9_0
1426 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1427 ; CHECK-NEXT: movs r3, #0
1428 ; CHECK-NEXT: movs r2, #0
1429 ; CHECK-NEXT: .LBB9_5: @ %for.body
1430 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1431 ; CHECK-NEXT: adds r5, r0, r3
1432 ; CHECK-NEXT: adds r4, r1, r3
1433 ; CHECK-NEXT: vldr.16 s2, [r4, #6]
1434 ; CHECK-NEXT: vldr.16 s4, [r5, #6]
1435 ; CHECK-NEXT: vldr.16 s6, [r5, #4]
1436 ; CHECK-NEXT: vldr.16 s8, [r5, #2]
1437 ; CHECK-NEXT: vmul.f16 s2, s4, s2
1438 ; CHECK-NEXT: vldr.16 s4, [r4, #4]
1439 ; CHECK-NEXT: vldr.16 s10, [r5]
1440 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1441 ; CHECK-NEXT: vmul.f16 s4, s6, s4
1442 ; CHECK-NEXT: vldr.16 s6, [r4, #2]
1443 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1444 ; CHECK-NEXT: adds r3, #8
1445 ; CHECK-NEXT: vmul.f16 s6, s8, s6
1446 ; CHECK-NEXT: vldr.16 s8, [r4]
1447 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1448 ; CHECK-NEXT: adds r2, #4
1449 ; CHECK-NEXT: vmul.f16 s8, s10, s8
1450 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1451 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1452 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1453 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1454 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1455 ; CHECK-NEXT: le lr, .LBB9_5
1456 ; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1457 ; CHECK-NEXT: wls lr, r12, .LBB9_9
1458 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1459 ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1460 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1461 ; CHECK-NEXT: .LBB9_8: @ %for.body.epil
1462 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1463 ; CHECK-NEXT: vldr.16 s2, [r1]
1464 ; CHECK-NEXT: vldr.16 s4, [r0]
1465 ; CHECK-NEXT: adds r0, #2
1466 ; CHECK-NEXT: adds r1, #2
1467 ; CHECK-NEXT: vmul.f16 s2, s4, s2
1468 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1469 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1470 ; CHECK-NEXT: le lr, .LBB9_8
1471 ; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
1472 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1473 ; CHECK-NEXT: .p2align 2
1474 ; CHECK-NEXT: @ %bb.10:
1475 ; CHECK-NEXT: .LCPI9_0:
1476 ; CHECK-NEXT: .long 0x00000000 @ float 0
; Empty input: fall straight through to the 0.0 result.
1478 %cmp8 = icmp eq i32 %N, 0
1479 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1481 for.body.preheader: ; preds = %entry
; xtraiter = N & 3 is the number of iterations left for the epilogue loop.
1483 %xtraiter = and i32 %N, 3
1484 %1 = icmp ult i32 %0, 3
1485 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1487 for.body.preheader.new: ; preds = %for.body.preheader
; unroll_iter = N - (N & 3): element count handled by the unrolled loop.
1488 %unroll_iter = sub i32 %N, %xtraiter
1491 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
; Merge point after (or instead of) the unrolled loop: picks up the running
; index and partial sum, then decides whether the epilogue has work to do.
1492 %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
1493 %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1494 %res.09.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
1495 %lcmp.mod = icmp eq i32 %xtraiter, 0
1496 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1498 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
; Epilogue loop: one multiply-accumulate per iteration, counting %epil.iter
; down from xtraiter to zero.
1499 %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1500 %res.09.epil = phi float [ %add.epil, %for.body.epil ], [ %res.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1501 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1502 %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.010.epil
1503 %2 = load half, ptr %arrayidx.epil, align 2
1504 %arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.010.epil
1505 %3 = load half, ptr %arrayidx1.epil, align 2
1506 %mul.epil = fmul half %2, %3
1507 %conv.epil = fpext half %mul.epil to float
1508 %add.epil = fadd float %res.09.epil, %conv.epil
1509 %inc.epil = add nuw i32 %i.010.epil, 1
1510 %epil.iter.sub = add i32 %epil.iter, -1
1511 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1512 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1514 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
; Final result: 0.0 from %entry, otherwise the sum from whichever loop ran last.
1515 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
1516 ret float %res.0.lcssa
1518 for.body: ; preds = %for.body, %for.body.preheader.new
; Unrolled main loop: %niter counts down from unroll_iter in steps of 4 while
; %i.010 counts up; the four partial products are added in index order.
1519 %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1520 %res.09 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
1521 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1522 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.010
1523 %4 = load half, ptr %arrayidx, align 2
1524 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.010
1525 %5 = load half, ptr %arrayidx1, align 2
1526 %mul = fmul half %4, %5
1527 %conv = fpext half %mul to float
1528 %add = fadd float %res.09, %conv
1529 %inc = or disjoint i32 %i.010, 1
1530 %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
1531 %6 = load half, ptr %arrayidx.1, align 2
1532 %arrayidx1.1 = getelementptr inbounds half, ptr %b, i32 %inc
1533 %7 = load half, ptr %arrayidx1.1, align 2
1534 %mul.1 = fmul half %6, %7
1535 %conv.1 = fpext half %mul.1 to float
1536 %add.1 = fadd float %add, %conv.1
1537 %inc.1 = or disjoint i32 %i.010, 2
1538 %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
1539 %8 = load half, ptr %arrayidx.2, align 2
1540 %arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
1541 %9 = load half, ptr %arrayidx1.2, align 2
1542 %mul.2 = fmul half %8, %9
1543 %conv.2 = fpext half %mul.2 to float
1544 %add.2 = fadd float %add.1, %conv.2
1545 %inc.2 = or disjoint i32 %i.010, 3
1546 %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
1547 %10 = load half, ptr %arrayidx.3, align 2
1548 %arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
1549 %11 = load half, ptr %arrayidx1.3, align 2
1550 %mul.3 = fmul half %10, %11
1551 %conv.3 = fpext half %mul.3 to float
1552 %add.3 = fadd float %add.2, %conv.3
1553 %inc.3 = add nuw i32 %i.010, 4
1554 %niter.nsub.3 = add i32 %niter, -4
1555 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1556 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; half_half_acc: scalar reduction  res += (float)(a[i] + b[i])  for i in [0, N).
; %a and %b are indexed as arrays of half; the half sum is extended to float
; before being accumulated. The result is 0.0 when N == 0.
; The input IR is already unrolled by four (for.body) and has an epilogue loop
; (for.body.epil) that runs (N & 3) times.
; The expected assembly below keeps the arithmetic scalar (vadd.f16,
; vcvtb.f32.f16, vadd.f32); the main loop is closed by `le lr` and the
; epilogue loop is entered through `wls lr`.
1559 define arm_aapcs_vfpcc float @half_half_acc(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
1560 ; CHECK-LABEL: half_half_acc:
1561 ; CHECK: @ %bb.0: @ %entry
1562 ; CHECK-NEXT: push {r4, r5, r7, lr}
1563 ; CHECK-NEXT: cbz r2, .LBB10_3
1564 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1565 ; CHECK-NEXT: subs r3, r2, #1
1566 ; CHECK-NEXT: and r12, r2, #3
1567 ; CHECK-NEXT: cmp r3, #3
1568 ; CHECK-NEXT: bhs .LBB10_4
1569 ; CHECK-NEXT: @ %bb.2:
1570 ; CHECK-NEXT: vldr s0, .LCPI10_0
1571 ; CHECK-NEXT: movs r2, #0
1572 ; CHECK-NEXT: b .LBB10_6
1573 ; CHECK-NEXT: .LBB10_3:
1574 ; CHECK-NEXT: vldr s0, .LCPI10_0
1575 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1576 ; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
1577 ; CHECK-NEXT: bic r2, r2, #3
1578 ; CHECK-NEXT: movs r3, #1
1579 ; CHECK-NEXT: subs r2, #4
1580 ; CHECK-NEXT: vldr s0, .LCPI10_0
1581 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1582 ; CHECK-NEXT: movs r3, #0
1583 ; CHECK-NEXT: movs r2, #0
1584 ; CHECK-NEXT: .LBB10_5: @ %for.body
1585 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1586 ; CHECK-NEXT: adds r5, r0, r3
1587 ; CHECK-NEXT: adds r4, r1, r3
1588 ; CHECK-NEXT: vldr.16 s2, [r4, #6]
1589 ; CHECK-NEXT: vldr.16 s4, [r5, #6]
1590 ; CHECK-NEXT: vldr.16 s6, [r5, #4]
1591 ; CHECK-NEXT: vldr.16 s8, [r5, #2]
1592 ; CHECK-NEXT: vadd.f16 s2, s4, s2
1593 ; CHECK-NEXT: vldr.16 s4, [r4, #4]
1594 ; CHECK-NEXT: vldr.16 s10, [r5]
1595 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1596 ; CHECK-NEXT: vadd.f16 s4, s6, s4
1597 ; CHECK-NEXT: vldr.16 s6, [r4, #2]
1598 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1599 ; CHECK-NEXT: adds r3, #8
1600 ; CHECK-NEXT: vadd.f16 s6, s8, s6
1601 ; CHECK-NEXT: vldr.16 s8, [r4]
1602 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1603 ; CHECK-NEXT: adds r2, #4
1604 ; CHECK-NEXT: vadd.f16 s8, s10, s8
1605 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1606 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1607 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1608 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1609 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1610 ; CHECK-NEXT: le lr, .LBB10_5
1611 ; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1612 ; CHECK-NEXT: wls lr, r12, .LBB10_9
1613 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1614 ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1615 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1616 ; CHECK-NEXT: .LBB10_8: @ %for.body.epil
1617 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1618 ; CHECK-NEXT: vldr.16 s2, [r1]
1619 ; CHECK-NEXT: vldr.16 s4, [r0]
1620 ; CHECK-NEXT: adds r0, #2
1621 ; CHECK-NEXT: adds r1, #2
1622 ; CHECK-NEXT: vadd.f16 s2, s4, s2
1623 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1624 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1625 ; CHECK-NEXT: le lr, .LBB10_8
1626 ; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
1627 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1628 ; CHECK-NEXT: .p2align 2
1629 ; CHECK-NEXT: @ %bb.10:
1630 ; CHECK-NEXT: .LCPI10_0:
1631 ; CHECK-NEXT: .long 0x00000000 @ float 0
1633 %cmp9 = icmp eq i32 %N, 0
1634 br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
1636 for.body.preheader: ; preds = %entry
1638 %xtraiter = and i32 %N, 3
1639 %1 = icmp ult i32 %0, 3
1640 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1642 for.body.preheader.new: ; preds = %for.body.preheader
1643 %unroll_iter = sub i32 %N, %xtraiter
1646 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1647 %add2.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add2.3, %for.body ]
1648 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1649 %res.010.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add2.3, %for.body ]
1650 %lcmp.mod = icmp eq i32 %xtraiter, 0
1651 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Epilogue loop: one element per iteration; %epil.iter counts down from
; %xtraiter (N & 3) to zero.
1653 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1654 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1655 %res.010.epil = phi float [ %add2.epil, %for.body.epil ], [ %res.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1656 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1657 %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.011.epil
1658 %2 = load half, ptr %arrayidx.epil, align 2
1659 %arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.011.epil
1660 %3 = load half, ptr %arrayidx1.epil, align 2
1661 %add.epil = fadd half %2, %3
1662 %conv.epil = fpext half %add.epil to float
1663 %add2.epil = fadd float %res.010.epil, %conv.epil
1664 %inc.epil = add nuw i32 %i.011.epil, 1
1665 %epil.iter.sub = add i32 %epil.iter, -1
1666 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1667 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1669 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1670 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add2.epil, %for.body.epil ]
1671 ret float %res.0.lcssa
; Main loop: four elements per iteration; %niter counts down by 4 from
; %unroll_iter (N - (N & 3)) to zero.
1673 for.body: ; preds = %for.body, %for.body.preheader.new
1674 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1675 %res.010 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add2.3, %for.body ]
1676 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1677 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011
1678 %4 = load half, ptr %arrayidx, align 2
1679 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.011
1680 %5 = load half, ptr %arrayidx1, align 2
1681 %add = fadd half %4, %5
1682 %conv = fpext half %add to float
1683 %add2 = fadd float %res.010, %conv
1684 %inc = or disjoint i32 %i.011, 1
1685 %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
1686 %6 = load half, ptr %arrayidx.1, align 2
1687 %arrayidx1.1 = getelementptr inbounds half, ptr %b, i32 %inc
1688 %7 = load half, ptr %arrayidx1.1, align 2
1689 %add.1 = fadd half %6, %7
1690 %conv.1 = fpext half %add.1 to float
1691 %add2.1 = fadd float %add2, %conv.1
1692 %inc.1 = or disjoint i32 %i.011, 2
1693 %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
1694 %8 = load half, ptr %arrayidx.2, align 2
1695 %arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
1696 %9 = load half, ptr %arrayidx1.2, align 2
1697 %add.2 = fadd half %8, %9
1698 %conv.2 = fpext half %add.2 to float
1699 %add2.2 = fadd float %add2.1, %conv.2
1700 %inc.2 = or disjoint i32 %i.011, 3
1701 %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
1702 %10 = load half, ptr %arrayidx.3, align 2
1703 %arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
1704 %11 = load half, ptr %arrayidx1.3, align 2
1705 %add.3 = fadd half %10, %11
1706 %conv.3 = fpext half %add.3 to float
1707 %add2.3 = fadd float %add2.2, %conv.3
1708 %inc.3 = add nuw i32 %i.011, 4
1709 %niter.nsub.3 = add i32 %niter, -4
1710 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1711 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; half_short_mac: scalar multiply-accumulate
;   res += (float)(a[i] * (half)b[i])  for i in [0, N).
; %a is indexed as an array of half and %b as an array of i16; each i16 is
; converted with sitofp to half before the half multiply, and the product is
; extended to float before being accumulated. The result is 0.0 when N == 0.
; The input IR is already unrolled by four (for.body) and has an epilogue loop
; (for.body.epil) that runs (N & 3) times.
; The expected assembly below loads the i16 operands with ldrsh, moves them to
; an FP register (vmov) and converts with vcvt.f16.s32, then uses vmul.f16,
; vcvtb.f32.f16 and vadd.f32; the main loop is closed by `le lr` and the
; epilogue loop is entered through `wls lr`.
1714 define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
1715 ; CHECK-LABEL: half_short_mac:
1716 ; CHECK: @ %bb.0: @ %entry
1717 ; CHECK-NEXT: push {r4, r5, r6, lr}
1718 ; CHECK-NEXT: cbz r2, .LBB11_3
1719 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1720 ; CHECK-NEXT: subs r3, r2, #1
1721 ; CHECK-NEXT: and r12, r2, #3
1722 ; CHECK-NEXT: cmp r3, #3
1723 ; CHECK-NEXT: bhs .LBB11_4
1724 ; CHECK-NEXT: @ %bb.2:
1725 ; CHECK-NEXT: vldr s0, .LCPI11_0
1726 ; CHECK-NEXT: movs r2, #0
1727 ; CHECK-NEXT: b .LBB11_6
1728 ; CHECK-NEXT: .LBB11_3:
1729 ; CHECK-NEXT: vldr s0, .LCPI11_0
1730 ; CHECK-NEXT: pop {r4, r5, r6, pc}
1731 ; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
1732 ; CHECK-NEXT: bic r2, r2, #3
1733 ; CHECK-NEXT: movs r3, #1
1734 ; CHECK-NEXT: subs r2, #4
1735 ; CHECK-NEXT: vldr s0, .LCPI11_0
1736 ; CHECK-NEXT: adds r4, r0, #4
1737 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1738 ; CHECK-NEXT: adds r3, r1, #4
1739 ; CHECK-NEXT: movs r2, #0
1740 ; CHECK-NEXT: .LBB11_5: @ %for.body
1741 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1742 ; CHECK-NEXT: ldrsh.w r5, [r3, #2]
1743 ; CHECK-NEXT: vldr.16 s2, [r4, #2]
1744 ; CHECK-NEXT: adds r2, #4
1745 ; CHECK-NEXT: vmov s4, r5
1746 ; CHECK-NEXT: ldrsh r5, [r3], #8
1747 ; CHECK-NEXT: vcvt.f16.s32 s4, s4
1748 ; CHECK-NEXT: ldrsh r6, [r3, #-10]
1749 ; CHECK-NEXT: vmul.f16 s2, s2, s4
1750 ; CHECK-NEXT: vmov s6, r5
1751 ; CHECK-NEXT: vldr.16 s4, [r4]
1752 ; CHECK-NEXT: vcvt.f16.s32 s6, s6
1753 ; CHECK-NEXT: ldrsh r5, [r3, #-12]
1754 ; CHECK-NEXT: vmul.f16 s4, s4, s6
1755 ; CHECK-NEXT: vmov s8, r6
1756 ; CHECK-NEXT: vldr.16 s6, [r4, #-2]
1757 ; CHECK-NEXT: vcvt.f16.s32 s8, s8
1758 ; CHECK-NEXT: vmov s10, r5
1759 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1760 ; CHECK-NEXT: vmul.f16 s6, s6, s8
1761 ; CHECK-NEXT: vldr.16 s8, [r4, #-4]
1762 ; CHECK-NEXT: vcvt.f16.s32 s10, s10
1763 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1764 ; CHECK-NEXT: vmul.f16 s8, s8, s10
1765 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1766 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1767 ; CHECK-NEXT: adds r4, #8
1768 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1769 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1770 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1771 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1772 ; CHECK-NEXT: le lr, .LBB11_5
1773 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1774 ; CHECK-NEXT: wls lr, r12, .LBB11_9
1775 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1776 ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1777 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1778 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil
1779 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1780 ; CHECK-NEXT: ldrsh r2, [r1], #2
1781 ; CHECK-NEXT: vldr.16 s2, [r0]
1782 ; CHECK-NEXT: adds r0, #2
1783 ; CHECK-NEXT: vmov s4, r2
1784 ; CHECK-NEXT: vcvt.f16.s32 s4, s4
1785 ; CHECK-NEXT: vmul.f16 s2, s2, s4
1786 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1787 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1788 ; CHECK-NEXT: le lr, .LBB11_8
1789 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
1790 ; CHECK-NEXT: pop {r4, r5, r6, pc}
1791 ; CHECK-NEXT: .p2align 2
1792 ; CHECK-NEXT: @ %bb.10:
1793 ; CHECK-NEXT: .LCPI11_0:
1794 ; CHECK-NEXT: .long 0x00000000 @ float 0
1796 %cmp10 = icmp eq i32 %N, 0
1797 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
1799 for.body.preheader: ; preds = %entry
1801 %xtraiter = and i32 %N, 3
1802 %1 = icmp ult i32 %0, 3
1803 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1805 for.body.preheader.new: ; preds = %for.body.preheader
1806 %unroll_iter = sub i32 %N, %xtraiter
1809 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1810 %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
1811 %i.012.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1812 %res.011.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
1813 %lcmp.mod = icmp eq i32 %xtraiter, 0
1814 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Epilogue loop: one element per iteration; %epil.iter counts down from
; %xtraiter (N & 3) to zero.
1816 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1817 %i.012.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.012.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1818 %res.011.epil = phi float [ %add.epil, %for.body.epil ], [ %res.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1819 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1820 %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.012.epil
1821 %2 = load half, ptr %arrayidx.epil, align 2
1822 %arrayidx1.epil = getelementptr inbounds i16, ptr %b, i32 %i.012.epil
1823 %3 = load i16, ptr %arrayidx1.epil, align 2
1824 %conv2.epil = sitofp i16 %3 to half
1825 %mul.epil = fmul half %2, %conv2.epil
1826 %conv3.epil = fpext half %mul.epil to float
1827 %add.epil = fadd float %res.011.epil, %conv3.epil
1828 %inc.epil = add nuw i32 %i.012.epil, 1
1829 %epil.iter.sub = add i32 %epil.iter, -1
1830 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1831 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1833 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1834 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
1835 ret float %res.0.lcssa
; Main loop: four elements per iteration; %niter counts down by 4 from
; %unroll_iter (N - (N & 3)) to zero.
1837 for.body: ; preds = %for.body, %for.body.preheader.new
1838 %i.012 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1839 %res.011 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
1840 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1841 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.012
1842 %4 = load half, ptr %arrayidx, align 2
1843 %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.012
1844 %5 = load i16, ptr %arrayidx1, align 2
1845 %conv2 = sitofp i16 %5 to half
1846 %mul = fmul half %4, %conv2
1847 %conv3 = fpext half %mul to float
1848 %add = fadd float %res.011, %conv3
1849 %inc = or disjoint i32 %i.012, 1
1850 %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
1851 %6 = load half, ptr %arrayidx.1, align 2
1852 %arrayidx1.1 = getelementptr inbounds i16, ptr %b, i32 %inc
1853 %7 = load i16, ptr %arrayidx1.1, align 2
1854 %conv2.1 = sitofp i16 %7 to half
1855 %mul.1 = fmul half %6, %conv2.1
1856 %conv3.1 = fpext half %mul.1 to float
1857 %add.1 = fadd float %add, %conv3.1
1858 %inc.1 = or disjoint i32 %i.012, 2
1859 %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
1860 %8 = load half, ptr %arrayidx.2, align 2
1861 %arrayidx1.2 = getelementptr inbounds i16, ptr %b, i32 %inc.1
1862 %9 = load i16, ptr %arrayidx1.2, align 2
1863 %conv2.2 = sitofp i16 %9 to half
1864 %mul.2 = fmul half %8, %conv2.2
1865 %conv3.2 = fpext half %mul.2 to float
1866 %add.2 = fadd float %add.1, %conv3.2
1867 %inc.2 = or disjoint i32 %i.012, 3
1868 %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
1869 %10 = load half, ptr %arrayidx.3, align 2
1870 %arrayidx1.3 = getelementptr inbounds i16, ptr %b, i32 %inc.2
1871 %11 = load i16, ptr %arrayidx1.3, align 2
1872 %conv2.3 = sitofp i16 %11 to half
1873 %mul.3 = fmul half %10, %conv2.3
1874 %conv3.3 = fpext half %mul.3 to float
1875 %add.3 = fadd float %add.2, %conv3.3
1876 %inc.3 = add nuw i32 %i.012, 4
1877 %niter.nsub.3 = add i32 %niter, -4
1878 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1879 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body