1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 %s -o - | FileCheck %s
3 ; float_float_mul: stores a[i] * b[i] to c[i] for i in [0, N), via a <4 x float> vector.body (taken when N >= 4 and vector.memcheck finds no overlap) and a scalar loop unrolled by 4 with an (N & 3)-iteration prologue.
4 define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
5 ; CHECK-LABEL: float_float_mul:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
8 ; CHECK-NEXT: cmp r3, #0
9 ; CHECK-NEXT: beq .LBB0_10
10 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
11 ; CHECK-NEXT: cmp r3, #3
12 ; CHECK-NEXT: bhi .LBB0_3
13 ; CHECK-NEXT: @ %bb.2:
14 ; CHECK-NEXT: mov.w r12, #0
15 ; CHECK-NEXT: b .LBB0_4
16 ; CHECK-NEXT: .LBB0_3: @ %vector.memcheck
17 ; CHECK-NEXT: add.w r7, r1, r3, lsl #2
18 ; CHECK-NEXT: add.w r6, r2, r3, lsl #2
19 ; CHECK-NEXT: cmp r7, r2
20 ; CHECK-NEXT: add.w r5, r0, r3, lsl #2
21 ; CHECK-NEXT: cset r7, hi
22 ; CHECK-NEXT: cmp r6, r1
23 ; CHECK-NEXT: csel r7, zr, r7, ls
24 ; CHECK-NEXT: cmp r6, r0
25 ; CHECK-NEXT: cset r6, hi
26 ; CHECK-NEXT: cmp r5, r2
27 ; CHECK-NEXT: cset r5, hi
28 ; CHECK-NEXT: mov.w r12, #0
29 ; CHECK-NEXT: tst r5, r6
30 ; CHECK-NEXT: it eq
31 ; CHECK-NEXT: cmpeq r7, #0
32 ; CHECK-NEXT: beq .LBB0_11
33 ; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22
34 ; CHECK-NEXT: mvn.w r7, r12
35 ; CHECK-NEXT: adds r4, r7, r3
36 ; CHECK-NEXT: and r7, r3, #3
37 ; CHECK-NEXT: add.w r8, r12, r7
38 ; CHECK-NEXT: wls lr, r7, .LBB0_7
39 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
40 ; CHECK-NEXT: add.w r6, r0, r12, lsl #2
41 ; CHECK-NEXT: add.w r7, r1, r12, lsl #2
42 ; CHECK-NEXT: add.w r5, r2, r12, lsl #2
43 ; CHECK-NEXT: mov r12, r8
44 ; CHECK-NEXT: .LBB0_6: @ %for.body.prol
45 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
46 ; CHECK-NEXT: vldmia r7!, {s0}
47 ; CHECK-NEXT: vldmia r6!, {s2}
48 ; CHECK-NEXT: vmul.f32 s0, s2, s0
49 ; CHECK-NEXT: vstmia r5!, {s0}
50 ; CHECK-NEXT: le lr, .LBB0_6
51 ; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit
52 ; CHECK-NEXT: cmp r4, #3
53 ; CHECK-NEXT: blo .LBB0_10
54 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
55 ; CHECK-NEXT: sub.w r3, r8, r3
56 ; CHECK-NEXT: movs r7, #1
57 ; CHECK-NEXT: rsb r3, r3, r3, lsl #30
58 ; CHECK-NEXT: subs r3, #4
59 ; CHECK-NEXT: add.w lr, r7, r3, lsr #2
60 ; CHECK-NEXT: lsl.w r3, r12, #2
61 ; CHECK-NEXT: .LBB0_9: @ %for.body
62 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
63 ; CHECK-NEXT: adds r7, r1, r3
64 ; CHECK-NEXT: adds r6, r0, r3
65 ; CHECK-NEXT: adds r5, r2, r3
66 ; CHECK-NEXT: adds r0, #16
67 ; CHECK-NEXT: vldr s0, [r7]
68 ; CHECK-NEXT: adds r1, #16
69 ; CHECK-NEXT: vldr s2, [r6]
70 ; CHECK-NEXT: adds r2, #16
71 ; CHECK-NEXT: vmul.f32 s0, s2, s0
72 ; CHECK-NEXT: vstr s0, [r5]
73 ; CHECK-NEXT: vldr s0, [r7, #4]
74 ; CHECK-NEXT: vldr s2, [r6, #4]
75 ; CHECK-NEXT: vmul.f32 s0, s2, s0
76 ; CHECK-NEXT: vstr s0, [r5, #4]
77 ; CHECK-NEXT: vldr s0, [r7, #8]
78 ; CHECK-NEXT: vldr s2, [r6, #8]
79 ; CHECK-NEXT: vmul.f32 s0, s2, s0
80 ; CHECK-NEXT: vstr s0, [r5, #8]
81 ; CHECK-NEXT: vldr s0, [r7, #12]
82 ; CHECK-NEXT: vldr s2, [r6, #12]
83 ; CHECK-NEXT: vmul.f32 s0, s2, s0
84 ; CHECK-NEXT: vstr s0, [r5, #12]
85 ; CHECK-NEXT: le lr, .LBB0_9
86 ; CHECK-NEXT: .LBB0_10: @ %for.cond.cleanup
87 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
88 ; CHECK-NEXT: .LBB0_11: @ %vector.ph
89 ; CHECK-NEXT: bic r12, r3, #3
90 ; CHECK-NEXT: movs r6, #1
91 ; CHECK-NEXT: sub.w r7, r12, #4
92 ; CHECK-NEXT: mov r4, r0
93 ; CHECK-NEXT: mov r5, r1
94 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
95 ; CHECK-NEXT: mov r6, r2
96 ; CHECK-NEXT: .LBB0_12: @ %vector.body
97 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
98 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16
99 ; CHECK-NEXT: vldrw.u32 q1, [r4], #16
100 ; CHECK-NEXT: vmul.f32 q0, q1, q0
101 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
102 ; CHECK-NEXT: le lr, .LBB0_12
103 ; CHECK-NEXT: @ %bb.13: @ %middle.block
104 ; CHECK-NEXT: cmp r12, r3
105 ; CHECK-NEXT: bne .LBB0_4
106 ; CHECK-NEXT: b .LBB0_10
107 entry:
108 %cmp8 = icmp eq i32 %N, 0
109 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
111 for.body.preheader: ; preds = %entry
112 %min.iters.check = icmp ult i32 %N, 4
113 br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
115 for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
116 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
117 %0 = xor i32 %i.09.ph, -1 ; %0 = -1 - start index
118 %1 = add i32 %0, %N ; %1 = N - 1 - start index (the checked mvn.w/adds pair); consumed by %4 below
119 %xtraiter = and i32 %N, 3 ; prologue trip count = N & 3
120 %lcmp.mod = icmp eq i32 %xtraiter, 0
121 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
123 for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
124 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
125 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
126 %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
127 %2 = load float, ptr %arrayidx.prol, align 4
128 %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
129 %3 = load float, ptr %arrayidx1.prol, align 4
130 %mul.prol = fmul float %2, %3
131 %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
132 store float %mul.prol, ptr %arrayidx2.prol, align 4
133 %inc.prol = add nuw i32 %i.09.prol, 1
134 %prol.iter.sub = add i32 %prol.iter, -1
135 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
136 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
138 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
139 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
140 %4 = icmp ult i32 %1, 3 ; fewer than 4 scalar iterations were pending: the prologue covered them all
141 br i1 %4, label %for.cond.cleanup, label %for.body
143 vector.memcheck: ; preds = %for.body.preheader
144 %scevgep = getelementptr float, ptr %c, i32 %N
145 %scevgep13 = getelementptr float, ptr %a, i32 %N
146 %scevgep16 = getelementptr float, ptr %b, i32 %N
147 %bound0 = icmp ugt ptr %scevgep13, %c
148 %bound1 = icmp ugt ptr %scevgep, %a
149 %found.conflict = and i1 %bound0, %bound1
150 %bound018 = icmp ugt ptr %scevgep16, %c
151 %bound119 = icmp ugt ptr %scevgep, %b
152 %found.conflict20 = and i1 %bound018, %bound119
153 %conflict.rdx = or i1 %found.conflict, %found.conflict20
154 br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
156 vector.ph: ; preds = %vector.memcheck
157 %n.vec = and i32 %N, -4 ; N rounded down to a multiple of 4
158 br label %vector.body
160 vector.body: ; preds = %vector.body, %vector.ph
161 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
162 %5 = getelementptr inbounds float, ptr %a, i32 %index
163 %wide.load = load <4 x float>, ptr %5, align 4
164 %6 = getelementptr inbounds float, ptr %b, i32 %index
165 %wide.load21 = load <4 x float>, ptr %6, align 4
166 %7 = fmul <4 x float> %wide.load, %wide.load21
167 %8 = getelementptr inbounds float, ptr %c, i32 %index
168 store <4 x float> %7, ptr %8, align 4
169 %index.next = add i32 %index, 4
170 %9 = icmp eq i32 %index.next, %n.vec
171 br i1 %9, label %middle.block, label %vector.body
173 middle.block: ; preds = %vector.body
174 %cmp.n = icmp eq i32 %n.vec, %N
175 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
177 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
178 ret void
180 for.body: ; preds = %for.body.prol.loopexit, %for.body
181 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
182 %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
183 %10 = load float, ptr %arrayidx, align 4
184 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
185 %11 = load float, ptr %arrayidx1, align 4
186 %mul = fmul float %10, %11
187 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
188 store float %mul, ptr %arrayidx2, align 4
189 %inc = add nuw i32 %i.09, 1
190 %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
191 %12 = load float, ptr %arrayidx.1, align 4
192 %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
193 %13 = load float, ptr %arrayidx1.1, align 4
194 %mul.1 = fmul float %12, %13
195 %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
196 store float %mul.1, ptr %arrayidx2.1, align 4
197 %inc.1 = add nuw i32 %i.09, 2
198 %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
199 %14 = load float, ptr %arrayidx.2, align 4
200 %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
201 %15 = load float, ptr %arrayidx1.2, align 4
202 %mul.2 = fmul float %14, %15
203 %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
204 store float %mul.2, ptr %arrayidx2.2, align 4
205 %inc.2 = add nuw i32 %i.09, 3
206 %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
207 %16 = load float, ptr %arrayidx.3, align 4
208 %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
209 %17 = load float, ptr %arrayidx1.3, align 4
210 %mul.3 = fmul float %16, %17
211 %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
212 store float %mul.3, ptr %arrayidx2.3, align 4
213 %inc.3 = add nuw i32 %i.09, 4
214 %exitcond.3 = icmp eq i32 %inc.3, %N
215 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
216 }
217 ; float_float_add: stores a[i] + b[i] to c[i] for i in [0, N), via a <4 x float> vector.body (taken when N >= 4 and vector.memcheck finds no overlap) and a scalar loop unrolled by 4 with an (N & 3)-iteration prologue.
218 define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
219 ; CHECK-LABEL: float_float_add:
220 ; CHECK: @ %bb.0: @ %entry
221 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
222 ; CHECK-NEXT: cmp r3, #0
223 ; CHECK-NEXT: beq .LBB1_10
224 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
225 ; CHECK-NEXT: cmp r3, #3
226 ; CHECK-NEXT: bhi .LBB1_3
227 ; CHECK-NEXT: @ %bb.2:
228 ; CHECK-NEXT: mov.w r12, #0
229 ; CHECK-NEXT: b .LBB1_4
230 ; CHECK-NEXT: .LBB1_3: @ %vector.memcheck
231 ; CHECK-NEXT: add.w r7, r1, r3, lsl #2
232 ; CHECK-NEXT: add.w r6, r2, r3, lsl #2
233 ; CHECK-NEXT: cmp r7, r2
234 ; CHECK-NEXT: add.w r5, r0, r3, lsl #2
235 ; CHECK-NEXT: cset r7, hi
236 ; CHECK-NEXT: cmp r6, r1
237 ; CHECK-NEXT: csel r7, zr, r7, ls
238 ; CHECK-NEXT: cmp r6, r0
239 ; CHECK-NEXT: cset r6, hi
240 ; CHECK-NEXT: cmp r5, r2
241 ; CHECK-NEXT: cset r5, hi
242 ; CHECK-NEXT: mov.w r12, #0
243 ; CHECK-NEXT: tst r5, r6
244 ; CHECK-NEXT: it eq
245 ; CHECK-NEXT: cmpeq r7, #0
246 ; CHECK-NEXT: beq .LBB1_11
247 ; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22
248 ; CHECK-NEXT: mvn.w r7, r12
249 ; CHECK-NEXT: adds r4, r7, r3
250 ; CHECK-NEXT: and r7, r3, #3
251 ; CHECK-NEXT: add.w r8, r12, r7
252 ; CHECK-NEXT: wls lr, r7, .LBB1_7
253 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
254 ; CHECK-NEXT: add.w r6, r0, r12, lsl #2
255 ; CHECK-NEXT: add.w r7, r1, r12, lsl #2
256 ; CHECK-NEXT: add.w r5, r2, r12, lsl #2
257 ; CHECK-NEXT: mov r12, r8
258 ; CHECK-NEXT: .LBB1_6: @ %for.body.prol
259 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
260 ; CHECK-NEXT: vldmia r7!, {s0}
261 ; CHECK-NEXT: vldmia r6!, {s2}
262 ; CHECK-NEXT: vadd.f32 s0, s2, s0
263 ; CHECK-NEXT: vstmia r5!, {s0}
264 ; CHECK-NEXT: le lr, .LBB1_6
265 ; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit
266 ; CHECK-NEXT: cmp r4, #3
267 ; CHECK-NEXT: blo .LBB1_10
268 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
269 ; CHECK-NEXT: sub.w r3, r8, r3
270 ; CHECK-NEXT: movs r7, #1
271 ; CHECK-NEXT: rsb r3, r3, r3, lsl #30
272 ; CHECK-NEXT: subs r3, #4
273 ; CHECK-NEXT: add.w lr, r7, r3, lsr #2
274 ; CHECK-NEXT: lsl.w r3, r12, #2
275 ; CHECK-NEXT: .LBB1_9: @ %for.body
276 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
277 ; CHECK-NEXT: adds r7, r1, r3
278 ; CHECK-NEXT: adds r6, r0, r3
279 ; CHECK-NEXT: adds r5, r2, r3
280 ; CHECK-NEXT: adds r0, #16
281 ; CHECK-NEXT: vldr s0, [r7]
282 ; CHECK-NEXT: adds r1, #16
283 ; CHECK-NEXT: vldr s2, [r6]
284 ; CHECK-NEXT: adds r2, #16
285 ; CHECK-NEXT: vadd.f32 s0, s2, s0
286 ; CHECK-NEXT: vstr s0, [r5]
287 ; CHECK-NEXT: vldr s0, [r7, #4]
288 ; CHECK-NEXT: vldr s2, [r6, #4]
289 ; CHECK-NEXT: vadd.f32 s0, s2, s0
290 ; CHECK-NEXT: vstr s0, [r5, #4]
291 ; CHECK-NEXT: vldr s0, [r7, #8]
292 ; CHECK-NEXT: vldr s2, [r6, #8]
293 ; CHECK-NEXT: vadd.f32 s0, s2, s0
294 ; CHECK-NEXT: vstr s0, [r5, #8]
295 ; CHECK-NEXT: vldr s0, [r7, #12]
296 ; CHECK-NEXT: vldr s2, [r6, #12]
297 ; CHECK-NEXT: vadd.f32 s0, s2, s0
298 ; CHECK-NEXT: vstr s0, [r5, #12]
299 ; CHECK-NEXT: le lr, .LBB1_9
300 ; CHECK-NEXT: .LBB1_10: @ %for.cond.cleanup
301 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
302 ; CHECK-NEXT: .LBB1_11: @ %vector.ph
303 ; CHECK-NEXT: bic r12, r3, #3
304 ; CHECK-NEXT: movs r6, #1
305 ; CHECK-NEXT: sub.w r7, r12, #4
306 ; CHECK-NEXT: mov r4, r0
307 ; CHECK-NEXT: mov r5, r1
308 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
309 ; CHECK-NEXT: mov r6, r2
310 ; CHECK-NEXT: .LBB1_12: @ %vector.body
311 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
312 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16
313 ; CHECK-NEXT: vldrw.u32 q1, [r4], #16
314 ; CHECK-NEXT: vadd.f32 q0, q1, q0
315 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
316 ; CHECK-NEXT: le lr, .LBB1_12
317 ; CHECK-NEXT: @ %bb.13: @ %middle.block
318 ; CHECK-NEXT: cmp r12, r3
319 ; CHECK-NEXT: bne .LBB1_4
320 ; CHECK-NEXT: b .LBB1_10
321 entry:
322 %cmp8 = icmp eq i32 %N, 0
323 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
325 for.body.preheader: ; preds = %entry
326 %min.iters.check = icmp ult i32 %N, 4
327 br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
329 for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
330 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
331 %0 = xor i32 %i.09.ph, -1 ; %0 = -1 - start index
332 %1 = add i32 %0, %N ; %1 = N - 1 - start index (the checked mvn.w/adds pair); consumed by %4 below
333 %xtraiter = and i32 %N, 3 ; prologue trip count = N & 3
334 %lcmp.mod = icmp eq i32 %xtraiter, 0
335 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
337 for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
338 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
339 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
340 %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
341 %2 = load float, ptr %arrayidx.prol, align 4
342 %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
343 %3 = load float, ptr %arrayidx1.prol, align 4
344 %add.prol = fadd float %2, %3
345 %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
346 store float %add.prol, ptr %arrayidx2.prol, align 4
347 %inc.prol = add nuw i32 %i.09.prol, 1
348 %prol.iter.sub = add i32 %prol.iter, -1
349 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
350 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
352 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
353 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
354 %4 = icmp ult i32 %1, 3 ; fewer than 4 scalar iterations were pending: the prologue covered them all
355 br i1 %4, label %for.cond.cleanup, label %for.body
357 vector.memcheck: ; preds = %for.body.preheader
358 %scevgep = getelementptr float, ptr %c, i32 %N
359 %scevgep13 = getelementptr float, ptr %a, i32 %N
360 %scevgep16 = getelementptr float, ptr %b, i32 %N
361 %bound0 = icmp ugt ptr %scevgep13, %c
362 %bound1 = icmp ugt ptr %scevgep, %a
363 %found.conflict = and i1 %bound0, %bound1
364 %bound018 = icmp ugt ptr %scevgep16, %c
365 %bound119 = icmp ugt ptr %scevgep, %b
366 %found.conflict20 = and i1 %bound018, %bound119
367 %conflict.rdx = or i1 %found.conflict, %found.conflict20
368 br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
370 vector.ph: ; preds = %vector.memcheck
371 %n.vec = and i32 %N, -4 ; N rounded down to a multiple of 4
372 br label %vector.body
374 vector.body: ; preds = %vector.body, %vector.ph
375 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
376 %5 = getelementptr inbounds float, ptr %a, i32 %index
377 %wide.load = load <4 x float>, ptr %5, align 4
378 %6 = getelementptr inbounds float, ptr %b, i32 %index
379 %wide.load21 = load <4 x float>, ptr %6, align 4
380 %7 = fadd <4 x float> %wide.load, %wide.load21
381 %8 = getelementptr inbounds float, ptr %c, i32 %index
382 store <4 x float> %7, ptr %8, align 4
383 %index.next = add i32 %index, 4
384 %9 = icmp eq i32 %index.next, %n.vec
385 br i1 %9, label %middle.block, label %vector.body
387 middle.block: ; preds = %vector.body
388 %cmp.n = icmp eq i32 %n.vec, %N
389 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
391 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
392 ret void
394 for.body: ; preds = %for.body.prol.loopexit, %for.body
395 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
396 %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
397 %10 = load float, ptr %arrayidx, align 4
398 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
399 %11 = load float, ptr %arrayidx1, align 4
400 %add = fadd float %10, %11
401 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
402 store float %add, ptr %arrayidx2, align 4
403 %inc = add nuw i32 %i.09, 1
404 %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
405 %12 = load float, ptr %arrayidx.1, align 4
406 %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
407 %13 = load float, ptr %arrayidx1.1, align 4
408 %add.1 = fadd float %12, %13
409 %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
410 store float %add.1, ptr %arrayidx2.1, align 4
411 %inc.1 = add nuw i32 %i.09, 2
412 %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
413 %14 = load float, ptr %arrayidx.2, align 4
414 %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
415 %15 = load float, ptr %arrayidx1.2, align 4
416 %add.2 = fadd float %14, %15
417 %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
418 store float %add.2, ptr %arrayidx2.2, align 4
419 %inc.2 = add nuw i32 %i.09, 3
420 %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
421 %16 = load float, ptr %arrayidx.3, align 4
422 %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
423 %17 = load float, ptr %arrayidx1.3, align 4
424 %add.3 = fadd float %16, %17
425 %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
426 store float %add.3, ptr %arrayidx2.3, align 4
427 %inc.3 = add nuw i32 %i.09, 4
428 %exitcond.3 = icmp eq i32 %inc.3, %N
429 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
430 }
431 ; float_float_sub: stores a[i] - b[i] to c[i] for i in [0, N), via a <4 x float> vector.body (taken when N >= 4 and vector.memcheck finds no overlap) and a scalar loop unrolled by 4 with an (N & 3)-iteration prologue.
432 define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
433 ; CHECK-LABEL: float_float_sub:
434 ; CHECK: @ %bb.0: @ %entry
435 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
436 ; CHECK-NEXT: cmp r3, #0
437 ; CHECK-NEXT: beq .LBB2_10
438 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
439 ; CHECK-NEXT: cmp r3, #3
440 ; CHECK-NEXT: bhi .LBB2_3
441 ; CHECK-NEXT: @ %bb.2:
442 ; CHECK-NEXT: mov.w r12, #0
443 ; CHECK-NEXT: b .LBB2_4
444 ; CHECK-NEXT: .LBB2_3: @ %vector.memcheck
445 ; CHECK-NEXT: add.w r7, r1, r3, lsl #2
446 ; CHECK-NEXT: add.w r6, r2, r3, lsl #2
447 ; CHECK-NEXT: cmp r7, r2
448 ; CHECK-NEXT: add.w r5, r0, r3, lsl #2
449 ; CHECK-NEXT: cset r7, hi
450 ; CHECK-NEXT: cmp r6, r1
451 ; CHECK-NEXT: csel r7, zr, r7, ls
452 ; CHECK-NEXT: cmp r6, r0
453 ; CHECK-NEXT: cset r6, hi
454 ; CHECK-NEXT: cmp r5, r2
455 ; CHECK-NEXT: cset r5, hi
456 ; CHECK-NEXT: mov.w r12, #0
457 ; CHECK-NEXT: tst r5, r6
458 ; CHECK-NEXT: it eq
459 ; CHECK-NEXT: cmpeq r7, #0
460 ; CHECK-NEXT: beq .LBB2_11
461 ; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22
462 ; CHECK-NEXT: mvn.w r7, r12
463 ; CHECK-NEXT: adds r4, r7, r3
464 ; CHECK-NEXT: and r7, r3, #3
465 ; CHECK-NEXT: add.w r8, r12, r7
466 ; CHECK-NEXT: wls lr, r7, .LBB2_7
467 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
468 ; CHECK-NEXT: add.w r6, r0, r12, lsl #2
469 ; CHECK-NEXT: add.w r7, r1, r12, lsl #2
470 ; CHECK-NEXT: add.w r5, r2, r12, lsl #2
471 ; CHECK-NEXT: mov r12, r8
472 ; CHECK-NEXT: .LBB2_6: @ %for.body.prol
473 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
474 ; CHECK-NEXT: vldmia r7!, {s0}
475 ; CHECK-NEXT: vldmia r6!, {s2}
476 ; CHECK-NEXT: vsub.f32 s0, s2, s0
477 ; CHECK-NEXT: vstmia r5!, {s0}
478 ; CHECK-NEXT: le lr, .LBB2_6
479 ; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit
480 ; CHECK-NEXT: cmp r4, #3
481 ; CHECK-NEXT: blo .LBB2_10
482 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
483 ; CHECK-NEXT: sub.w r3, r8, r3
484 ; CHECK-NEXT: movs r7, #1
485 ; CHECK-NEXT: rsb r3, r3, r3, lsl #30
486 ; CHECK-NEXT: subs r3, #4
487 ; CHECK-NEXT: add.w lr, r7, r3, lsr #2
488 ; CHECK-NEXT: lsl.w r3, r12, #2
489 ; CHECK-NEXT: .LBB2_9: @ %for.body
490 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
491 ; CHECK-NEXT: adds r7, r1, r3
492 ; CHECK-NEXT: adds r6, r0, r3
493 ; CHECK-NEXT: adds r5, r2, r3
494 ; CHECK-NEXT: adds r0, #16
495 ; CHECK-NEXT: vldr s0, [r7]
496 ; CHECK-NEXT: adds r1, #16
497 ; CHECK-NEXT: vldr s2, [r6]
498 ; CHECK-NEXT: adds r2, #16
499 ; CHECK-NEXT: vsub.f32 s0, s2, s0
500 ; CHECK-NEXT: vstr s0, [r5]
501 ; CHECK-NEXT: vldr s0, [r7, #4]
502 ; CHECK-NEXT: vldr s2, [r6, #4]
503 ; CHECK-NEXT: vsub.f32 s0, s2, s0
504 ; CHECK-NEXT: vstr s0, [r5, #4]
505 ; CHECK-NEXT: vldr s0, [r7, #8]
506 ; CHECK-NEXT: vldr s2, [r6, #8]
507 ; CHECK-NEXT: vsub.f32 s0, s2, s0
508 ; CHECK-NEXT: vstr s0, [r5, #8]
509 ; CHECK-NEXT: vldr s0, [r7, #12]
510 ; CHECK-NEXT: vldr s2, [r6, #12]
511 ; CHECK-NEXT: vsub.f32 s0, s2, s0
512 ; CHECK-NEXT: vstr s0, [r5, #12]
513 ; CHECK-NEXT: le lr, .LBB2_9
514 ; CHECK-NEXT: .LBB2_10: @ %for.cond.cleanup
515 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
516 ; CHECK-NEXT: .LBB2_11: @ %vector.ph
517 ; CHECK-NEXT: bic r12, r3, #3
518 ; CHECK-NEXT: movs r6, #1
519 ; CHECK-NEXT: sub.w r7, r12, #4
520 ; CHECK-NEXT: mov r4, r0
521 ; CHECK-NEXT: mov r5, r1
522 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
523 ; CHECK-NEXT: mov r6, r2
524 ; CHECK-NEXT: .LBB2_12: @ %vector.body
525 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
526 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16
527 ; CHECK-NEXT: vldrw.u32 q1, [r4], #16
528 ; CHECK-NEXT: vsub.f32 q0, q1, q0
529 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
530 ; CHECK-NEXT: le lr, .LBB2_12
531 ; CHECK-NEXT: @ %bb.13: @ %middle.block
532 ; CHECK-NEXT: cmp r12, r3
533 ; CHECK-NEXT: bne .LBB2_4
534 ; CHECK-NEXT: b .LBB2_10
535 entry:
536 %cmp8 = icmp eq i32 %N, 0
537 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
539 for.body.preheader: ; preds = %entry
540 %min.iters.check = icmp ult i32 %N, 4
541 br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
543 for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
544 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
545 %0 = xor i32 %i.09.ph, -1 ; %0 = -1 - start index
546 %1 = add i32 %0, %N ; %1 = N - 1 - start index (the checked mvn.w/adds pair); consumed by %4 below
547 %xtraiter = and i32 %N, 3 ; prologue trip count = N & 3
548 %lcmp.mod = icmp eq i32 %xtraiter, 0
549 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
551 for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
552 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
553 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
554 %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
555 %2 = load float, ptr %arrayidx.prol, align 4
556 %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
557 %3 = load float, ptr %arrayidx1.prol, align 4
558 %sub.prol = fsub float %2, %3
559 %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
560 store float %sub.prol, ptr %arrayidx2.prol, align 4
561 %inc.prol = add nuw i32 %i.09.prol, 1
562 %prol.iter.sub = add i32 %prol.iter, -1
563 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
564 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
566 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
567 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
568 %4 = icmp ult i32 %1, 3 ; fewer than 4 scalar iterations were pending: the prologue covered them all
569 br i1 %4, label %for.cond.cleanup, label %for.body
571 vector.memcheck: ; preds = %for.body.preheader
572 %scevgep = getelementptr float, ptr %c, i32 %N
573 %scevgep13 = getelementptr float, ptr %a, i32 %N
574 %scevgep16 = getelementptr float, ptr %b, i32 %N
575 %bound0 = icmp ugt ptr %scevgep13, %c
576 %bound1 = icmp ugt ptr %scevgep, %a
577 %found.conflict = and i1 %bound0, %bound1
578 %bound018 = icmp ugt ptr %scevgep16, %c
579 %bound119 = icmp ugt ptr %scevgep, %b
580 %found.conflict20 = and i1 %bound018, %bound119
581 %conflict.rdx = or i1 %found.conflict, %found.conflict20
582 br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
584 vector.ph: ; preds = %vector.memcheck
585 %n.vec = and i32 %N, -4 ; N rounded down to a multiple of 4
586 br label %vector.body
588 vector.body: ; preds = %vector.body, %vector.ph
589 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
590 %5 = getelementptr inbounds float, ptr %a, i32 %index
591 %wide.load = load <4 x float>, ptr %5, align 4
592 %6 = getelementptr inbounds float, ptr %b, i32 %index
593 %wide.load21 = load <4 x float>, ptr %6, align 4
594 %7 = fsub <4 x float> %wide.load, %wide.load21
595 %8 = getelementptr inbounds float, ptr %c, i32 %index
596 store <4 x float> %7, ptr %8, align 4
597 %index.next = add i32 %index, 4
598 %9 = icmp eq i32 %index.next, %n.vec
599 br i1 %9, label %middle.block, label %vector.body
601 middle.block: ; preds = %vector.body
602 %cmp.n = icmp eq i32 %n.vec, %N
603 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
605 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
606 ret void
608 for.body: ; preds = %for.body.prol.loopexit, %for.body
609 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
610 %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
611 %10 = load float, ptr %arrayidx, align 4
612 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
613 %11 = load float, ptr %arrayidx1, align 4
614 %sub = fsub float %10, %11
615 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
616 store float %sub, ptr %arrayidx2, align 4
617 %inc = add nuw i32 %i.09, 1
618 %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
619 %12 = load float, ptr %arrayidx.1, align 4
620 %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
621 %13 = load float, ptr %arrayidx1.1, align 4
622 %sub.1 = fsub float %12, %13
623 %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
624 store float %sub.1, ptr %arrayidx2.1, align 4
625 %inc.1 = add nuw i32 %i.09, 2
626 %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
627 %14 = load float, ptr %arrayidx.2, align 4
628 %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
629 %15 = load float, ptr %arrayidx1.2, align 4
630 %sub.2 = fsub float %14, %15
631 %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
632 store float %sub.2, ptr %arrayidx2.2, align 4
633 %inc.2 = add nuw i32 %i.09, 3
634 %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
635 %16 = load float, ptr %arrayidx.3, align 4
636 %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
637 %17 = load float, ptr %arrayidx1.3, align 4
638 %sub.3 = fsub float %16, %17
639 %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
640 store float %sub.3, ptr %arrayidx2.3, align 4
641 %inc.3 = add nuw i32 %i.09, 4
642 %exitcond.3 = icmp eq i32 %inc.3, %N
643 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
644 }
646 define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
647 ; CHECK-LABEL: float_int_mul:
648 ; CHECK: @ %bb.0: @ %entry
649 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
650 ; CHECK-NEXT: cmp r3, #0
651 ; CHECK-NEXT: beq.w .LBB3_13
652 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
653 ; CHECK-NEXT: cmp r3, #3
654 ; CHECK-NEXT: bls .LBB3_6
655 ; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
656 ; CHECK-NEXT: add.w r7, r0, r3, lsl #2
657 ; CHECK-NEXT: cmp r7, r2
659 ; CHECK-NEXT: addhi.w r7, r2, r3, lsl #2
660 ; CHECK-NEXT: cmphi r7, r0
661 ; CHECK-NEXT: bhi .LBB3_6
662 ; CHECK-NEXT: @ %bb.3: @ %vector.ph
663 ; CHECK-NEXT: bic r12, r3, #3
664 ; CHECK-NEXT: movs r6, #1
665 ; CHECK-NEXT: sub.w r7, r12, #4
666 ; CHECK-NEXT: mov r4, r0
667 ; CHECK-NEXT: mov r5, r1
668 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
669 ; CHECK-NEXT: mov r6, r2
670 ; CHECK-NEXT: .LBB3_4: @ %vector.body
671 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
672 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16
673 ; CHECK-NEXT: vldrw.u32 q1, [r4], #16
674 ; CHECK-NEXT: vcvt.f32.s32 q0, q0
675 ; CHECK-NEXT: vmul.f32 q0, q1, q0
676 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
677 ; CHECK-NEXT: le lr, .LBB3_4
678 ; CHECK-NEXT: @ %bb.5: @ %middle.block
679 ; CHECK-NEXT: cmp r12, r3
680 ; CHECK-NEXT: bne .LBB3_7
681 ; CHECK-NEXT: b .LBB3_13
682 ; CHECK-NEXT: .LBB3_6:
683 ; CHECK-NEXT: mov.w r12, #0
684 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16
685 ; CHECK-NEXT: mvn.w r7, r12
686 ; CHECK-NEXT: add.w r9, r7, r3
687 ; CHECK-NEXT: and r7, r3, #3
688 ; CHECK-NEXT: add.w r8, r12, r7
689 ; CHECK-NEXT: wls lr, r7, .LBB3_10
690 ; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader
691 ; CHECK-NEXT: add.w r6, r0, r12, lsl #2
692 ; CHECK-NEXT: add.w r7, r1, r12, lsl #2
693 ; CHECK-NEXT: add.w r5, r2, r12, lsl #2
694 ; CHECK-NEXT: mov r12, r8
695 ; CHECK-NEXT: .LBB3_9: @ %for.body.prol
696 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
697 ; CHECK-NEXT: ldr r4, [r7], #4
698 ; CHECK-NEXT: vldmia r6!, {s2}
699 ; CHECK-NEXT: vmov s0, r4
700 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
701 ; CHECK-NEXT: vmul.f32 s0, s2, s0
702 ; CHECK-NEXT: vstmia r5!, {s0}
703 ; CHECK-NEXT: le lr, .LBB3_9
704 ; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit
705 ; CHECK-NEXT: cmp.w r9, #3
706 ; CHECK-NEXT: blo .LBB3_13
707 ; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1
708 ; CHECK-NEXT: sub.w r3, r8, r3
709 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2
710 ; CHECK-NEXT: movs r7, #1
711 ; CHECK-NEXT: adds r1, #8
712 ; CHECK-NEXT: rsb r3, r3, r3, lsl #30
713 ; CHECK-NEXT: subs r3, #4
714 ; CHECK-NEXT: add.w lr, r7, r3, lsr #2
715 ; CHECK-NEXT: lsl.w r3, r12, #2
716 ; CHECK-NEXT: .LBB3_12: @ %for.body
717 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
718 ; CHECK-NEXT: vldr s0, [r1, #-8]
719 ; CHECK-NEXT: adds r7, r0, r3
720 ; CHECK-NEXT: adds r6, r2, r3
721 ; CHECK-NEXT: adds r0, #16
722 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
723 ; CHECK-NEXT: vldr s2, [r7]
724 ; CHECK-NEXT: adds r2, #16
725 ; CHECK-NEXT: vmul.f32 s0, s2, s0
726 ; CHECK-NEXT: vstr s0, [r6]
727 ; CHECK-NEXT: vldr s0, [r1, #-4]
728 ; CHECK-NEXT: vldr s2, [r7, #4]
729 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
730 ; CHECK-NEXT: vmul.f32 s0, s2, s0
731 ; CHECK-NEXT: vstr s0, [r6, #4]
732 ; CHECK-NEXT: vldr s0, [r1]
733 ; CHECK-NEXT: vldr s2, [r7, #8]
734 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
735 ; CHECK-NEXT: vmul.f32 s0, s2, s0
736 ; CHECK-NEXT: vstr s0, [r6, #8]
737 ; CHECK-NEXT: vldr s0, [r1, #4]
738 ; CHECK-NEXT: adds r1, #16
739 ; CHECK-NEXT: vldr s2, [r7, #12]
740 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
741 ; CHECK-NEXT: vmul.f32 s0, s2, s0
742 ; CHECK-NEXT: vstr s0, [r6, #12]
743 ; CHECK-NEXT: le lr, .LBB3_12
744 ; CHECK-NEXT: .LBB3_13: @ %for.cond.cleanup
745 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
747 %cmp8 = icmp eq i32 %N, 0
748 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
750 for.body.preheader: ; preds = %entry
751 %min.iters.check = icmp ult i32 %N, 4
752 br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck
754 for.body.preheader16: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
755 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
756 %0 = xor i32 %i.09.ph, -1
758 %xtraiter = and i32 %N, 3
759 %lcmp.mod = icmp eq i32 %xtraiter, 0
760 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
762 for.body.prol: ; preds = %for.body.preheader16, %for.body.prol
763 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader16 ]
764 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader16 ]
765 %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
766 %2 = load float, ptr %arrayidx.prol, align 4
767 %arrayidx1.prol = getelementptr inbounds i32, ptr %b, i32 %i.09.prol
768 %3 = load i32, ptr %arrayidx1.prol, align 4
769 %conv.prol = sitofp i32 %3 to float
770 %mul.prol = fmul float %2, %conv.prol
771 %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
772 store float %mul.prol, ptr %arrayidx2.prol, align 4
773 %inc.prol = add nuw i32 %i.09.prol, 1
774 %prol.iter.sub = add i32 %prol.iter, -1
775 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
776 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
778 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader16
779 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
780 %4 = icmp ult i32 %1, 3
781 br i1 %4, label %for.cond.cleanup, label %for.body
783 vector.memcheck: ; preds = %for.body.preheader
784 %scevgep = getelementptr float, ptr %c, i32 %N
785 %scevgep13 = getelementptr float, ptr %a, i32 %N
786 %bound0 = icmp ugt ptr %scevgep13, %c
787 %bound1 = icmp ugt ptr %scevgep, %a
788 %found.conflict = and i1 %bound0, %bound1
789 br i1 %found.conflict, label %for.body.preheader16, label %vector.ph
791 vector.ph: ; preds = %vector.memcheck
792 %n.vec = and i32 %N, -4
793 br label %vector.body
795 vector.body: ; preds = %vector.body, %vector.ph
796 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
797 %5 = getelementptr inbounds float, ptr %a, i32 %index
798 %wide.load = load <4 x float>, ptr %5, align 4
799 %6 = getelementptr inbounds i32, ptr %b, i32 %index
800 %wide.load15 = load <4 x i32>, ptr %6, align 4
801 %7 = sitofp <4 x i32> %wide.load15 to <4 x float>
802 %8 = fmul <4 x float> %wide.load, %7
803 %9 = getelementptr inbounds float, ptr %c, i32 %index
804 store <4 x float> %8, ptr %9, align 4
805 %index.next = add i32 %index, 4
806 %10 = icmp eq i32 %index.next, %n.vec
807 br i1 %10, label %middle.block, label %vector.body
809 middle.block: ; preds = %vector.body
810 %cmp.n = icmp eq i32 %n.vec, %N
811 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16
813 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
816 for.body: ; preds = %for.body.prol.loopexit, %for.body
817 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
818 %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
819 %11 = load float, ptr %arrayidx, align 4
820 %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09
821 %12 = load i32, ptr %arrayidx1, align 4
822 %conv = sitofp i32 %12 to float
823 %mul = fmul float %11, %conv
824 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
825 store float %mul, ptr %arrayidx2, align 4
826 %inc = add nuw i32 %i.09, 1
827 %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
828 %13 = load float, ptr %arrayidx.1, align 4
829 %arrayidx1.1 = getelementptr inbounds i32, ptr %b, i32 %inc
830 %14 = load i32, ptr %arrayidx1.1, align 4
831 %conv.1 = sitofp i32 %14 to float
832 %mul.1 = fmul float %13, %conv.1
833 %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
834 store float %mul.1, ptr %arrayidx2.1, align 4
835 %inc.1 = add nuw i32 %i.09, 2
836 %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
837 %15 = load float, ptr %arrayidx.2, align 4
838 %arrayidx1.2 = getelementptr inbounds i32, ptr %b, i32 %inc.1
839 %16 = load i32, ptr %arrayidx1.2, align 4
840 %conv.2 = sitofp i32 %16 to float
841 %mul.2 = fmul float %15, %conv.2
842 %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
843 store float %mul.2, ptr %arrayidx2.2, align 4
844 %inc.2 = add nuw i32 %i.09, 3
845 %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
846 %17 = load float, ptr %arrayidx.3, align 4
847 %arrayidx1.3 = getelementptr inbounds i32, ptr %b, i32 %inc.2
848 %18 = load i32, ptr %arrayidx1.3, align 4
849 %conv.3 = sitofp i32 %18 to float
850 %mul.3 = fmul float %17, %conv.3
851 %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
852 store float %mul.3, ptr %arrayidx2.3, align 4
853 %inc.3 = add nuw i32 %i.09, 4
854 %exitcond.3 = icmp eq i32 %inc.3, %N
855 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
; float_int_int_mul: c[i] = sitofp(b[i] * a[i]) for i in [0, N). a and b are
; read as i32, the product is an i32 `mul nsw`, and c is written as float.
; The IR is already vectorized: when N >= 4, vector.body processes four
; elements per iteration up to %n.vec = N & -4, and the scalar for.body loop
; handles N < 4 and whatever remains after the vector loop. There is no
; runtime overlap check in this function; for.body.preheader branches straight
; to vector.ph.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of the file); regenerate them instead of editing them by hand.
858 define arm_aapcs_vfpcc void @float_int_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
859 ; CHECK-LABEL: float_int_int_mul:
860 ; CHECK: @ %bb.0: @ %entry
861 ; CHECK-NEXT: push {r4, r5, r6, lr}
862 ; CHECK-NEXT: cbz r3, .LBB4_8
863 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
864 ; CHECK-NEXT: cmp r3, #3
865 ; CHECK-NEXT: bhi .LBB4_3
866 ; CHECK-NEXT: @ %bb.2:
867 ; CHECK-NEXT: mov.w r12, #0
868 ; CHECK-NEXT: b .LBB4_6
869 ; CHECK-NEXT: .LBB4_3: @ %vector.ph
870 ; CHECK-NEXT: bic r12, r3, #3
871 ; CHECK-NEXT: movs r5, #1
872 ; CHECK-NEXT: sub.w r6, r12, #4
873 ; CHECK-NEXT: mov r4, r0
874 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
875 ; CHECK-NEXT: mov r5, r1
876 ; CHECK-NEXT: mov r6, r2
877 ; CHECK-NEXT: .LBB4_4: @ %vector.body
878 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
879 ; CHECK-NEXT: vldrw.u32 q0, [r4], #16
880 ; CHECK-NEXT: vldrw.u32 q1, [r5], #16
881 ; CHECK-NEXT: vmul.i32 q0, q1, q0
882 ; CHECK-NEXT: vcvt.f32.s32 q0, q0
883 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
884 ; CHECK-NEXT: le lr, .LBB4_4
885 ; CHECK-NEXT: @ %bb.5: @ %middle.block
886 ; CHECK-NEXT: cmp r12, r3
888 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
889 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11
890 ; CHECK-NEXT: sub.w lr, r3, r12
891 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2
892 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2
893 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
894 ; CHECK-NEXT: .LBB4_7: @ %for.body
895 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
896 ; CHECK-NEXT: ldr r3, [r0], #4
897 ; CHECK-NEXT: ldr r6, [r1], #4
898 ; CHECK-NEXT: muls r3, r6, r3
899 ; CHECK-NEXT: vmov s0, r3
900 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
901 ; CHECK-NEXT: vstmia r2!, {s0}
902 ; CHECK-NEXT: le lr, .LBB4_7
903 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
904 ; CHECK-NEXT: pop {r4, r5, r6, pc}
906 %cmp8 = icmp eq i32 %N, 0
907 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
909 for.body.preheader: ; preds = %entry
910 %min.iters.check = icmp ult i32 %N, 4
911 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop entry: the start index is 0 when the vector loop was skipped,
; otherwise %n.vec (the number of elements the vector loop already handled).
913 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
914 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
917 vector.ph: ; preds = %for.body.preheader
918 %n.vec = and i32 %N, -4
919 br label %vector.body
; Vector loop: four i32 lanes multiplied, converted to float, and stored.
921 vector.body: ; preds = %vector.body, %vector.ph
922 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
923 %0 = getelementptr inbounds i32, ptr %a, i32 %index
924 %wide.load = load <4 x i32>, ptr %0, align 4
925 %1 = getelementptr inbounds i32, ptr %b, i32 %index
926 %wide.load10 = load <4 x i32>, ptr %1, align 4
927 %2 = mul nsw <4 x i32> %wide.load10, %wide.load
928 %3 = sitofp <4 x i32> %2 to <4 x float>
929 %4 = getelementptr inbounds float, ptr %c, i32 %index
930 store <4 x float> %3, ptr %4, align 4
931 %index.next = add i32 %index, 4
932 %5 = icmp eq i32 %index.next, %n.vec
933 br i1 %5, label %middle.block, label %vector.body
; Skip the scalar loop entirely when N is a multiple of four.
935 middle.block: ; preds = %vector.body
936 %cmp.n = icmp eq i32 %n.vec, %N
937 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
939 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, same multiply-then-convert as above.
942 for.body: ; preds = %for.body.preheader11, %for.body
943 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
944 %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.09
945 %6 = load i32, ptr %arrayidx, align 4
946 %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09
947 %7 = load i32, ptr %arrayidx1, align 4
948 %mul = mul nsw i32 %7, %6
949 %conv = sitofp i32 %mul to float
950 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
951 store float %conv, ptr %arrayidx2, align 4
952 %inc = add nuw i32 %i.09, 1
953 %exitcond = icmp eq i32 %inc, %N
954 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_mul: c[i] = fpext(a[i] * b[i]) for i in [0, N). a and b hold half
; values, the multiply is done in half precision, and the result is widened to
; float before being stored to c.
; The IR is already vectorized: when N >= 4, vector.body processes four
; elements per iteration up to %n.vec = N & -4, and the scalar for.body loop
; handles N < 4 and the remainder.
; In the expected assembly the vector loop fetches each <4 x half> operand as
; two 32-bit words (ldr, then vmov.32 into lanes 0 and 1), multiplies with
; vmul.f16, and widens the four results with vcvtb/vcvtt before the store.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of the file); regenerate them instead of editing them by hand.
957 define arm_aapcs_vfpcc void @half_half_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
958 ; CHECK-LABEL: half_half_mul:
959 ; CHECK: @ %bb.0: @ %entry
960 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
961 ; CHECK-NEXT: cmp r3, #0
962 ; CHECK-NEXT: beq .LBB5_8
963 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
964 ; CHECK-NEXT: cmp r3, #3
965 ; CHECK-NEXT: bhi .LBB5_3
966 ; CHECK-NEXT: @ %bb.2:
967 ; CHECK-NEXT: mov.w r12, #0
968 ; CHECK-NEXT: b .LBB5_6
969 ; CHECK-NEXT: .LBB5_3: @ %vector.ph
970 ; CHECK-NEXT: bic r12, r3, #3
971 ; CHECK-NEXT: movs r5, #1
972 ; CHECK-NEXT: sub.w r6, r12, #4
973 ; CHECK-NEXT: mov r4, r0
974 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
975 ; CHECK-NEXT: mov r5, r1
976 ; CHECK-NEXT: mov r6, r2
977 ; CHECK-NEXT: .LBB5_4: @ %vector.body
978 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
979 ; CHECK-NEXT: ldr.w r9, [r4]
980 ; CHECK-NEXT: ldr r7, [r5]
981 ; CHECK-NEXT: ldr.w r8, [r4, #4]
982 ; CHECK-NEXT: vmov.32 q0[0], r9
983 ; CHECK-NEXT: ldr.w r10, [r5, #4]
984 ; CHECK-NEXT: vmov.32 q1[0], r7
985 ; CHECK-NEXT: vmov.32 q0[1], r8
986 ; CHECK-NEXT: adds r4, #8
987 ; CHECK-NEXT: vmov.32 q1[1], r10
988 ; CHECK-NEXT: adds r5, #8
989 ; CHECK-NEXT: vmul.f16 q0, q0, q1
990 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
991 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
992 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
993 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
994 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
995 ; CHECK-NEXT: le lr, .LBB5_4
996 ; CHECK-NEXT: @ %bb.5: @ %middle.block
997 ; CHECK-NEXT: cmp r12, r3
998 ; CHECK-NEXT: beq .LBB5_8
999 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11
1000 ; CHECK-NEXT: sub.w lr, r3, r12
1001 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1002 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1003 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
1004 ; CHECK-NEXT: .LBB5_7: @ %for.body
1005 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1006 ; CHECK-NEXT: vldr.16 s0, [r1]
1007 ; CHECK-NEXT: vldr.16 s2, [r0]
1008 ; CHECK-NEXT: adds r0, #2
1009 ; CHECK-NEXT: adds r1, #2
1010 ; CHECK-NEXT: vmul.f16 s0, s2, s0
1011 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1012 ; CHECK-NEXT: vstmia r2!, {s0}
1013 ; CHECK-NEXT: le lr, .LBB5_7
1014 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
1015 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
1017 %cmp8 = icmp eq i32 %N, 0
1018 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1020 for.body.preheader: ; preds = %entry
1021 %min.iters.check = icmp ult i32 %N, 4
1022 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop entry: the start index is 0 when the vector loop was skipped,
; otherwise %n.vec (the number of elements the vector loop already handled).
1024 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
1025 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1028 vector.ph: ; preds = %for.body.preheader
1029 %n.vec = and i32 %N, -4
1030 br label %vector.body
; Vector loop: <4 x half> multiply, then fpext to <4 x float> for the store.
1032 vector.body: ; preds = %vector.body, %vector.ph
1033 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1034 %0 = getelementptr inbounds half, ptr %a, i32 %index
1035 %wide.load = load <4 x half>, ptr %0, align 2
1036 %1 = getelementptr inbounds half, ptr %b, i32 %index
1037 %wide.load10 = load <4 x half>, ptr %1, align 2
1038 %2 = fmul <4 x half> %wide.load, %wide.load10
1039 %3 = fpext <4 x half> %2 to <4 x float>
1040 %4 = getelementptr inbounds float, ptr %c, i32 %index
1041 store <4 x float> %3, ptr %4, align 4
1042 %index.next = add i32 %index, 4
1043 %5 = icmp eq i32 %index.next, %n.vec
1044 br i1 %5, label %middle.block, label %vector.body
; Skip the scalar loop entirely when N is a multiple of four.
1046 middle.block: ; preds = %vector.body
1047 %cmp.n = icmp eq i32 %n.vec, %N
1048 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1050 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one half multiply per iteration, widened to float on store.
1053 for.body: ; preds = %for.body.preheader11, %for.body
1054 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1055 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
1056 %6 = load half, ptr %arrayidx, align 2
1057 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
1058 %7 = load half, ptr %arrayidx1, align 2
1059 %mul = fmul half %6, %7
1060 %conv = fpext half %mul to float
1061 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
1062 store float %conv, ptr %arrayidx2, align 4
1063 %inc = add nuw i32 %i.09, 1
1064 %exitcond = icmp eq i32 %inc, %N
1065 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_add: c[i] = fpext(a[i] + b[i]) for i in [0, N). a and b hold half
; values, the add is done in half precision, and the result is widened to
; float before being stored to c.
; The IR is already vectorized: when N >= 4, vector.body processes four
; elements per iteration up to %n.vec = N & -4, and the scalar for.body loop
; handles N < 4 and the remainder.
; In the expected assembly the vector loop fetches each <4 x half> operand as
; two 32-bit words (ldr, then vmov.32 into lanes 0 and 1), adds with
; vadd.f16, and widens the four results with vcvtb/vcvtt before the store.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of the file); regenerate them instead of editing them by hand.
1068 define arm_aapcs_vfpcc void @half_half_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
1069 ; CHECK-LABEL: half_half_add:
1070 ; CHECK: @ %bb.0: @ %entry
1071 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1072 ; CHECK-NEXT: cmp r3, #0
1073 ; CHECK-NEXT: beq .LBB6_8
1074 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1075 ; CHECK-NEXT: cmp r3, #3
1076 ; CHECK-NEXT: bhi .LBB6_3
1077 ; CHECK-NEXT: @ %bb.2:
1078 ; CHECK-NEXT: mov.w r12, #0
1079 ; CHECK-NEXT: b .LBB6_6
1080 ; CHECK-NEXT: .LBB6_3: @ %vector.ph
1081 ; CHECK-NEXT: bic r12, r3, #3
1082 ; CHECK-NEXT: movs r5, #1
1083 ; CHECK-NEXT: sub.w r6, r12, #4
1084 ; CHECK-NEXT: mov r4, r0
1085 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1086 ; CHECK-NEXT: mov r5, r1
1087 ; CHECK-NEXT: mov r6, r2
1088 ; CHECK-NEXT: .LBB6_4: @ %vector.body
1089 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1090 ; CHECK-NEXT: ldr.w r9, [r4]
1091 ; CHECK-NEXT: ldr r7, [r5]
1092 ; CHECK-NEXT: ldr.w r8, [r4, #4]
1093 ; CHECK-NEXT: vmov.32 q0[0], r9
1094 ; CHECK-NEXT: ldr.w r10, [r5, #4]
1095 ; CHECK-NEXT: vmov.32 q1[0], r7
1096 ; CHECK-NEXT: vmov.32 q0[1], r8
1097 ; CHECK-NEXT: adds r4, #8
1098 ; CHECK-NEXT: vmov.32 q1[1], r10
1099 ; CHECK-NEXT: adds r5, #8
1100 ; CHECK-NEXT: vadd.f16 q0, q0, q1
1101 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
1102 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
1103 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
1104 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1105 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
1106 ; CHECK-NEXT: le lr, .LBB6_4
1107 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1108 ; CHECK-NEXT: cmp r12, r3
1109 ; CHECK-NEXT: beq .LBB6_8
1110 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11
1111 ; CHECK-NEXT: sub.w lr, r3, r12
1112 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1113 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1114 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
1115 ; CHECK-NEXT: .LBB6_7: @ %for.body
1116 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1117 ; CHECK-NEXT: vldr.16 s0, [r1]
1118 ; CHECK-NEXT: vldr.16 s2, [r0]
1119 ; CHECK-NEXT: adds r0, #2
1120 ; CHECK-NEXT: adds r1, #2
1121 ; CHECK-NEXT: vadd.f16 s0, s2, s0
1122 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1123 ; CHECK-NEXT: vstmia r2!, {s0}
1124 ; CHECK-NEXT: le lr, .LBB6_7
1125 ; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
1126 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
1128 %cmp8 = icmp eq i32 %N, 0
1129 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1131 for.body.preheader: ; preds = %entry
1132 %min.iters.check = icmp ult i32 %N, 4
1133 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop entry: the start index is 0 when the vector loop was skipped,
; otherwise %n.vec (the number of elements the vector loop already handled).
1135 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
1136 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1139 vector.ph: ; preds = %for.body.preheader
1140 %n.vec = and i32 %N, -4
1141 br label %vector.body
; Vector loop: <4 x half> add, then fpext to <4 x float> for the store.
1143 vector.body: ; preds = %vector.body, %vector.ph
1144 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1145 %0 = getelementptr inbounds half, ptr %a, i32 %index
1146 %wide.load = load <4 x half>, ptr %0, align 2
1147 %1 = getelementptr inbounds half, ptr %b, i32 %index
1148 %wide.load10 = load <4 x half>, ptr %1, align 2
1149 %2 = fadd <4 x half> %wide.load, %wide.load10
1150 %3 = fpext <4 x half> %2 to <4 x float>
1151 %4 = getelementptr inbounds float, ptr %c, i32 %index
1152 store <4 x float> %3, ptr %4, align 4
1153 %index.next = add i32 %index, 4
1154 %5 = icmp eq i32 %index.next, %n.vec
1155 br i1 %5, label %middle.block, label %vector.body
; Skip the scalar loop entirely when N is a multiple of four.
1157 middle.block: ; preds = %vector.body
1158 %cmp.n = icmp eq i32 %n.vec, %N
1159 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1161 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one half add per iteration, widened to float on store.
1164 for.body: ; preds = %for.body.preheader11, %for.body
1165 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1166 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
1167 %6 = load half, ptr %arrayidx, align 2
1168 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
1169 %7 = load half, ptr %arrayidx1, align 2
1170 %add = fadd half %6, %7
1171 %conv = fpext half %add to float
1172 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
1173 store float %conv, ptr %arrayidx2, align 4
1174 %inc = add nuw i32 %i.09, 1
1175 %exitcond = icmp eq i32 %inc, %N
1176 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_sub: c[i] = fpext(a[i] - b[i]) for i in [0, N). a and b hold half
; values, the subtract is done in half precision, and the result is widened to
; float before being stored to c.
; The IR is already vectorized: when N >= 4, vector.body processes four
; elements per iteration up to %n.vec = N & -4, and the scalar for.body loop
; handles N < 4 and the remainder.
; In the expected assembly the vector loop fetches each <4 x half> operand as
; two 32-bit words (ldr, then vmov.32 into lanes 0 and 1), subtracts with
; vsub.f16, and widens the four results with vcvtb/vcvtt before the store.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of the file); regenerate them instead of editing them by hand.
1179 define arm_aapcs_vfpcc void @half_half_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
1180 ; CHECK-LABEL: half_half_sub:
1181 ; CHECK: @ %bb.0: @ %entry
1182 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1183 ; CHECK-NEXT: cmp r3, #0
1184 ; CHECK-NEXT: beq .LBB7_8
1185 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1186 ; CHECK-NEXT: cmp r3, #3
1187 ; CHECK-NEXT: bhi .LBB7_3
1188 ; CHECK-NEXT: @ %bb.2:
1189 ; CHECK-NEXT: mov.w r12, #0
1190 ; CHECK-NEXT: b .LBB7_6
1191 ; CHECK-NEXT: .LBB7_3: @ %vector.ph
1192 ; CHECK-NEXT: bic r12, r3, #3
1193 ; CHECK-NEXT: movs r5, #1
1194 ; CHECK-NEXT: sub.w r6, r12, #4
1195 ; CHECK-NEXT: mov r4, r0
1196 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1197 ; CHECK-NEXT: mov r5, r1
1198 ; CHECK-NEXT: mov r6, r2
1199 ; CHECK-NEXT: .LBB7_4: @ %vector.body
1200 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1201 ; CHECK-NEXT: ldr.w r9, [r4]
1202 ; CHECK-NEXT: ldr r7, [r5]
1203 ; CHECK-NEXT: ldr.w r8, [r4, #4]
1204 ; CHECK-NEXT: vmov.32 q0[0], r9
1205 ; CHECK-NEXT: ldr.w r10, [r5, #4]
1206 ; CHECK-NEXT: vmov.32 q1[0], r7
1207 ; CHECK-NEXT: vmov.32 q0[1], r8
1208 ; CHECK-NEXT: adds r4, #8
1209 ; CHECK-NEXT: vmov.32 q1[1], r10
1210 ; CHECK-NEXT: adds r5, #8
1211 ; CHECK-NEXT: vsub.f16 q0, q0, q1
1212 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
1213 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
1214 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
1215 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1216 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
1217 ; CHECK-NEXT: le lr, .LBB7_4
1218 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1219 ; CHECK-NEXT: cmp r12, r3
1220 ; CHECK-NEXT: beq .LBB7_8
1221 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11
1222 ; CHECK-NEXT: sub.w lr, r3, r12
1223 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1224 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1225 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
1226 ; CHECK-NEXT: .LBB7_7: @ %for.body
1227 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1228 ; CHECK-NEXT: vldr.16 s0, [r1]
1229 ; CHECK-NEXT: vldr.16 s2, [r0]
1230 ; CHECK-NEXT: adds r0, #2
1231 ; CHECK-NEXT: adds r1, #2
1232 ; CHECK-NEXT: vsub.f16 s0, s2, s0
1233 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1234 ; CHECK-NEXT: vstmia r2!, {s0}
1235 ; CHECK-NEXT: le lr, .LBB7_7
1236 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
1237 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
1239 %cmp8 = icmp eq i32 %N, 0
1240 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1242 for.body.preheader: ; preds = %entry
1243 %min.iters.check = icmp ult i32 %N, 4
1244 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop entry: the start index is 0 when the vector loop was skipped,
; otherwise %n.vec (the number of elements the vector loop already handled).
1246 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
1247 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1250 vector.ph: ; preds = %for.body.preheader
1251 %n.vec = and i32 %N, -4
1252 br label %vector.body
; Vector loop: <4 x half> subtract, then fpext to <4 x float> for the store.
1254 vector.body: ; preds = %vector.body, %vector.ph
1255 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1256 %0 = getelementptr inbounds half, ptr %a, i32 %index
1257 %wide.load = load <4 x half>, ptr %0, align 2
1258 %1 = getelementptr inbounds half, ptr %b, i32 %index
1259 %wide.load10 = load <4 x half>, ptr %1, align 2
1260 %2 = fsub <4 x half> %wide.load, %wide.load10
1261 %3 = fpext <4 x half> %2 to <4 x float>
1262 %4 = getelementptr inbounds float, ptr %c, i32 %index
1263 store <4 x float> %3, ptr %4, align 4
1264 %index.next = add i32 %index, 4
1265 %5 = icmp eq i32 %index.next, %n.vec
1266 br i1 %5, label %middle.block, label %vector.body
; Skip the scalar loop entirely when N is a multiple of four.
1268 middle.block: ; preds = %vector.body
1269 %cmp.n = icmp eq i32 %n.vec, %N
1270 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1272 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one half subtract per iteration, widened to float on store.
1275 for.body: ; preds = %for.body.preheader11, %for.body
1276 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1277 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
1278 %6 = load half, ptr %arrayidx, align 2
1279 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
1280 %7 = load half, ptr %arrayidx1, align 2
1281 %sub = fsub half %6, %7
1282 %conv = fpext half %sub to float
1283 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
1284 store float %conv, ptr %arrayidx2, align 4
1285 %inc = add nuw i32 %i.09, 1
1286 %exitcond = icmp eq i32 %inc, %N
1287 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_short_mul: c[i] = fpext(a[i] * sitofp(b[i])) for i in [0, N). a holds
; half values and b holds i16 values; each b element is converted to half, the
; multiply is done in half precision, and the result is widened to float
; before being stored to c.
; The IR is already vectorized: when N >= 4, vector.body processes four
; elements per iteration up to %n.vec = N & -4, and the scalar for.body loop
; handles N < 4 and the remainder.
; In the expected assembly the function reserves 16 bytes of stack
; (sub sp, #16) and the vector loop passes the loaded i16 lanes through that
; slot (vstrh.32 to [r1] with r1 = sp, then vldrw.u32 from [r1]) before the
; vcvt.f16.s16 conversion.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of the file); regenerate them instead of editing them by hand.
1290 define arm_aapcs_vfpcc void @half_short_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
1291 ; CHECK-LABEL: half_short_mul:
1292 ; CHECK: @ %bb.0: @ %entry
1293 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
1294 ; CHECK-NEXT: sub sp, #16
1295 ; CHECK-NEXT: cmp r3, #0
1296 ; CHECK-NEXT: beq .LBB8_8
1297 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1298 ; CHECK-NEXT: mov r8, r2
1299 ; CHECK-NEXT: mov r9, r1
1300 ; CHECK-NEXT: cmp r3, #3
1301 ; CHECK-NEXT: bhi .LBB8_3
1302 ; CHECK-NEXT: @ %bb.2:
1303 ; CHECK-NEXT: mov.w r12, #0
1304 ; CHECK-NEXT: b .LBB8_6
1305 ; CHECK-NEXT: .LBB8_3: @ %vector.ph
1306 ; CHECK-NEXT: bic r12, r3, #3
1307 ; CHECK-NEXT: movs r6, #1
1308 ; CHECK-NEXT: sub.w r7, r12, #4
1309 ; CHECK-NEXT: mov r1, sp
1310 ; CHECK-NEXT: mov r5, r0
1311 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
1312 ; CHECK-NEXT: mov r6, r9
1313 ; CHECK-NEXT: mov r7, r8
1314 ; CHECK-NEXT: .LBB8_4: @ %vector.body
1315 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1316 ; CHECK-NEXT: vldrh.u32 q0, [r6], #8
1317 ; CHECK-NEXT: ldr r4, [r5]
1318 ; CHECK-NEXT: ldr r2, [r5, #4]
1319 ; CHECK-NEXT: adds r5, #8
1320 ; CHECK-NEXT: vstrh.32 q0, [r1]
1321 ; CHECK-NEXT: vmov.32 q1[0], r4
1322 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1323 ; CHECK-NEXT: vmov.32 q1[1], r2
1324 ; CHECK-NEXT: vcvt.f16.s16 q0, q0
1325 ; CHECK-NEXT: vmul.f16 q0, q1, q0
1326 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
1327 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
1328 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
1329 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1330 ; CHECK-NEXT: vstrb.8 q0, [r7], #16
1331 ; CHECK-NEXT: le lr, .LBB8_4
1332 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1333 ; CHECK-NEXT: cmp r12, r3
1334 ; CHECK-NEXT: beq .LBB8_8
1335 ; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13
1336 ; CHECK-NEXT: sub.w lr, r3, r12
1337 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1338 ; CHECK-NEXT: add.w r1, r9, r12, lsl #1
1339 ; CHECK-NEXT: add.w r2, r8, r12, lsl #2
1340 ; CHECK-NEXT: .LBB8_7: @ %for.body
1341 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1342 ; CHECK-NEXT: ldrsh r3, [r1], #2
1343 ; CHECK-NEXT: vldr.16 s0, [r0]
1344 ; CHECK-NEXT: adds r0, #2
1345 ; CHECK-NEXT: vmov s2, r3
1346 ; CHECK-NEXT: vcvt.f16.s32 s2, s2
1347 ; CHECK-NEXT: vmul.f16 s0, s0, s2
1348 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1349 ; CHECK-NEXT: vstmia r2!, {s0}
1350 ; CHECK-NEXT: le lr, .LBB8_7
1351 ; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup
1352 ; CHECK-NEXT: add sp, #16
1353 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
1355 %cmp10 = icmp eq i32 %N, 0
1356 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
1358 for.body.preheader: ; preds = %entry
1359 %min.iters.check = icmp ult i32 %N, 4
1360 br i1 %min.iters.check, label %for.body.preheader13, label %vector.ph
; Scalar loop entry: the start index is 0 when the vector loop was skipped,
; otherwise %n.vec (the number of elements the vector loop already handled).
1362 for.body.preheader13: ; preds = %middle.block, %for.body.preheader
1363 %i.011.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1366 vector.ph: ; preds = %for.body.preheader
1367 %n.vec = and i32 %N, -4
1368 br label %vector.body
; Vector loop: convert <4 x i16> to <4 x half>, multiply by the half operand,
; then fpext to <4 x float> for the store.
1370 vector.body: ; preds = %vector.body, %vector.ph
1371 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1372 %0 = getelementptr inbounds half, ptr %a, i32 %index
1373 %wide.load = load <4 x half>, ptr %0, align 2
1374 %1 = getelementptr inbounds i16, ptr %b, i32 %index
1375 %wide.load12 = load <4 x i16>, ptr %1, align 2
1376 %2 = sitofp <4 x i16> %wide.load12 to <4 x half>
1377 %3 = fmul <4 x half> %wide.load, %2
1378 %4 = fpext <4 x half> %3 to <4 x float>
1379 %5 = getelementptr inbounds float, ptr %c, i32 %index
1380 store <4 x float> %4, ptr %5, align 4
1381 %index.next = add i32 %index, 4
1382 %6 = icmp eq i32 %index.next, %n.vec
1383 br i1 %6, label %middle.block, label %vector.body
; Skip the scalar loop entirely when N is a multiple of four.
1385 middle.block: ; preds = %vector.body
1386 %cmp.n = icmp eq i32 %n.vec, %N
1387 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
1389 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one i16-to-half conversion and half multiply per iteration.
1392 for.body: ; preds = %for.body.preheader13, %for.body
1393 %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader13 ]
1394 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011
1395 %7 = load half, ptr %arrayidx, align 2
1396 %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.011
1397 %8 = load i16, ptr %arrayidx1, align 2
1398 %conv2 = sitofp i16 %8 to half
1399 %mul = fmul half %7, %conv2
1400 %conv3 = fpext half %mul to float
1401 %arrayidx4 = getelementptr inbounds float, ptr %c, i32 %i.011
1402 store float %conv3, ptr %arrayidx4, align 4
1403 %inc = add nuw i32 %i.011, 1
1404 %exitcond = icmp eq i32 %inc, %N
1405 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_mac: returns the float sum, over i in [0, N), of
; fpext(a[i] * b[i]). a and b hold half values; each product is computed in
; half precision, widened to float, and added to a float accumulator that
; starts at 0.0. The result is 0.0 when N == 0.
; This IR is scalar (no vector types): for.body is unrolled by four and
; for.body.epil runs the remaining N & 3 iterations one at a time.
; In the expected assembly the 0.0 start value is loaded from the literal
; pool entry .LCPI9_0, and both loops are closed by `le lr, ...`.
; The assertion lines below are autogenerated (see the NOTE on the first line
; of the file); regenerate them instead of editing them by hand.
1408 define arm_aapcs_vfpcc float @half_half_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
1409 ; CHECK-LABEL: half_half_mac:
1410 ; CHECK: @ %bb.0: @ %entry
1411 ; CHECK-NEXT: push {r4, r5, r7, lr}
1412 ; CHECK-NEXT: cbz r2, .LBB9_3
1413 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1414 ; CHECK-NEXT: and r12, r2, #3
1415 ; CHECK-NEXT: subs r3, r2, #1
1416 ; CHECK-NEXT: cmp r3, #3
1417 ; CHECK-NEXT: bhs .LBB9_4
1418 ; CHECK-NEXT: @ %bb.2:
1419 ; CHECK-NEXT: vldr s0, .LCPI9_0
1420 ; CHECK-NEXT: movs r2, #0
1421 ; CHECK-NEXT: b .LBB9_6
1422 ; CHECK-NEXT: .LBB9_3:
1423 ; CHECK-NEXT: vldr s0, .LCPI9_0
1424 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1425 ; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
1426 ; CHECK-NEXT: bic r2, r2, #3
1427 ; CHECK-NEXT: movs r3, #1
1428 ; CHECK-NEXT: subs r2, #4
1429 ; CHECK-NEXT: vldr s0, .LCPI9_0
1430 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1431 ; CHECK-NEXT: movs r3, #0
1432 ; CHECK-NEXT: movs r2, #0
1433 ; CHECK-NEXT: .LBB9_5: @ %for.body
1434 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1435 ; CHECK-NEXT: adds r5, r0, r3
1436 ; CHECK-NEXT: adds r4, r1, r3
1437 ; CHECK-NEXT: vldr.16 s2, [r4, #6]
1438 ; CHECK-NEXT: vldr.16 s4, [r5, #6]
1439 ; CHECK-NEXT: vldr.16 s6, [r5, #4]
1440 ; CHECK-NEXT: vldr.16 s8, [r5, #2]
1441 ; CHECK-NEXT: vmul.f16 s2, s4, s2
1442 ; CHECK-NEXT: vldr.16 s4, [r4, #4]
1443 ; CHECK-NEXT: vldr.16 s10, [r5]
1444 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1445 ; CHECK-NEXT: vmul.f16 s4, s6, s4
1446 ; CHECK-NEXT: vldr.16 s6, [r4, #2]
1447 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1448 ; CHECK-NEXT: adds r3, #8
1449 ; CHECK-NEXT: vmul.f16 s6, s8, s6
1450 ; CHECK-NEXT: vldr.16 s8, [r4]
1451 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1452 ; CHECK-NEXT: adds r2, #4
1453 ; CHECK-NEXT: vmul.f16 s8, s10, s8
1454 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1455 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1456 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1457 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1458 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1459 ; CHECK-NEXT: le lr, .LBB9_5
1460 ; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1461 ; CHECK-NEXT: wls lr, r12, .LBB9_9
1462 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1463 ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1464 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1465 ; CHECK-NEXT: .LBB9_8: @ %for.body.epil
1466 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1467 ; CHECK-NEXT: vldr.16 s2, [r1]
1468 ; CHECK-NEXT: vldr.16 s4, [r0]
1469 ; CHECK-NEXT: adds r0, #2
1470 ; CHECK-NEXT: adds r1, #2
1471 ; CHECK-NEXT: vmul.f16 s2, s4, s2
1472 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1473 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1474 ; CHECK-NEXT: le lr, .LBB9_8
1475 ; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
1476 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1477 ; CHECK-NEXT: .p2align 2
1478 ; CHECK-NEXT: @ %bb.10:
1479 ; CHECK-NEXT: .LCPI9_0:
1480 ; CHECK-NEXT: .long 0x00000000 @ float 0
1482 %cmp8 = icmp eq i32 %N, 0
1483 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; %xtraiter = N & 3 is the number of iterations left for the epilogue loop.
1485 for.body.preheader: ; preds = %entry
1487 %xtraiter = and i32 %N, 3
1488 %1 = icmp ult i32 %0, 3
1489 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
; %unroll_iter = N - (N & 3) is the element count handled by the unrolled loop.
1491 for.body.preheader.new: ; preds = %for.body.preheader
1492 %unroll_iter = sub i32 %N, %xtraiter
; Join point after (or instead of) the unrolled loop. On the edge from
; for.body.preheader no unrolled iteration has run, so the index restarts at 0
; and the accumulator at 0.0; %add.lcssa.ph is undef on that edge.
1495 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1496 %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
1497 %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1498 %res.09.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
1499 %lcmp.mod = icmp eq i32 %xtraiter, 0
1500 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Epilogue loop: one multiply-accumulate per iteration, counting %epil.iter
; down from %xtraiter to zero.
1502 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1503 %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1504 %res.09.epil = phi float [ %add.epil, %for.body.epil ], [ %res.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1505 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1506 %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.010.epil
1507 %2 = load half, ptr %arrayidx.epil, align 2
1508 %arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.010.epil
1509 %3 = load half, ptr %arrayidx1.epil, align 2
1510 %mul.epil = fmul half %2, %3
1511 %conv.epil = fpext half %mul.epil to float
1512 %add.epil = fadd float %res.09.epil, %conv.epil
1513 %inc.epil = add nuw i32 %i.010.epil, 1
1514 %epil.iter.sub = add i32 %epil.iter, -1
1515 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1516 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
; Function result: 0.0 from entry (N == 0), otherwise the accumulated sum.
1518 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1519 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
1520 ret float %res.0.lcssa
; Unrolled loop: four multiply-accumulate steps per iteration at indices
; i, i|1, i|2 and i|3, with %niter counting down by 4 from %unroll_iter.
1522 for.body: ; preds = %for.body, %for.body.preheader.new
1523 %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1524 %res.09 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
1525 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1526 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.010
1527 %4 = load half, ptr %arrayidx, align 2
1528 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.010
1529 %5 = load half, ptr %arrayidx1, align 2
1530 %mul = fmul half %4, %5
1531 %conv = fpext half %mul to float
1532 %add = fadd float %res.09, %conv
1533 %inc = or disjoint i32 %i.010, 1
1534 %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
1535 %6 = load half, ptr %arrayidx.1, align 2
1536 %arrayidx1.1 = getelementptr inbounds half, ptr %b, i32 %inc
1537 %7 = load half, ptr %arrayidx1.1, align 2
1538 %mul.1 = fmul half %6, %7
1539 %conv.1 = fpext half %mul.1 to float
1540 %add.1 = fadd float %add, %conv.1
1541 %inc.1 = or disjoint i32 %i.010, 2
1542 %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
1543 %8 = load half, ptr %arrayidx.2, align 2
1544 %arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
1545 %9 = load half, ptr %arrayidx1.2, align 2
1546 %mul.2 = fmul half %8, %9
1547 %conv.2 = fpext half %mul.2 to float
1548 %add.2 = fadd float %add.1, %conv.2
1549 %inc.2 = or disjoint i32 %i.010, 3
1550 %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
1551 %10 = load half, ptr %arrayidx.3, align 2
1552 %arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
1553 %11 = load half, ptr %arrayidx1.3, align 2
1554 %mul.3 = fmul half %10, %11
1555 %conv.3 = fpext half %mul.3 to float
1556 %add.3 = fadd float %add.2, %conv.3
1557 %inc.3 = add nuw i32 %i.010, 4
1558 %niter.nsub.3 = add i32 %niter, -4
1559 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1560 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; half_half_acc: float reduction over two half-precision arrays.
;
; For every i in [0, N) the IR below loads a[i] and b[i] as half, adds them in
; half precision, extends that sum to float and adds it to a float running
; total, which is the return value. N == 0 returns 0.0 directly from the
; entry check.
;
; The IR is already unrolled by four (%for.body) and has a remainder loop
; (%for.body.epil) that runs N & 3 times.
;
; The expected assembly keeps the arithmetic scalar (vldr.16 / vadd.f16 /
; vcvtb.f32.f16 / vadd.f32), closes both loops with `le`, and guards the
; remainder loop with `wls`.
;
; The assertion lines are autogenerated (see the note at the top of this
; file); regenerate them with the update script instead of editing by hand.
1563 define arm_aapcs_vfpcc float @half_half_acc(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
1564 ; CHECK-LABEL: half_half_acc:
1565 ; CHECK: @ %bb.0: @ %entry
1566 ; CHECK-NEXT: push {r4, r5, r7, lr}
1567 ; CHECK-NEXT: cbz r2, .LBB10_3
1568 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1569 ; CHECK-NEXT: and r12, r2, #3
1570 ; CHECK-NEXT: subs r3, r2, #1
1571 ; CHECK-NEXT: cmp r3, #3
1572 ; CHECK-NEXT: bhs .LBB10_4
1573 ; CHECK-NEXT: @ %bb.2:
1574 ; CHECK-NEXT: vldr s0, .LCPI10_0
1575 ; CHECK-NEXT: movs r2, #0
1576 ; CHECK-NEXT: b .LBB10_6
1577 ; CHECK-NEXT: .LBB10_3:
1578 ; CHECK-NEXT: vldr s0, .LCPI10_0
1579 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1580 ; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
1581 ; CHECK-NEXT: bic r2, r2, #3
1582 ; CHECK-NEXT: movs r3, #1
1583 ; CHECK-NEXT: subs r2, #4
1584 ; CHECK-NEXT: vldr s0, .LCPI10_0
1585 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1586 ; CHECK-NEXT: movs r3, #0
1587 ; CHECK-NEXT: movs r2, #0
1588 ; CHECK-NEXT: .LBB10_5: @ %for.body
1589 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1590 ; CHECK-NEXT: adds r5, r0, r3
1591 ; CHECK-NEXT: adds r4, r1, r3
1592 ; CHECK-NEXT: vldr.16 s2, [r4, #6]
1593 ; CHECK-NEXT: vldr.16 s4, [r5, #6]
1594 ; CHECK-NEXT: vldr.16 s6, [r5, #4]
1595 ; CHECK-NEXT: vldr.16 s8, [r5, #2]
1596 ; CHECK-NEXT: vadd.f16 s2, s4, s2
1597 ; CHECK-NEXT: vldr.16 s4, [r4, #4]
1598 ; CHECK-NEXT: vldr.16 s10, [r5]
1599 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1600 ; CHECK-NEXT: vadd.f16 s4, s6, s4
1601 ; CHECK-NEXT: vldr.16 s6, [r4, #2]
1602 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1603 ; CHECK-NEXT: adds r3, #8
1604 ; CHECK-NEXT: vadd.f16 s6, s8, s6
1605 ; CHECK-NEXT: vldr.16 s8, [r4]
1606 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1607 ; CHECK-NEXT: adds r2, #4
1608 ; CHECK-NEXT: vadd.f16 s8, s10, s8
1609 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1610 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1611 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1612 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1613 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1614 ; CHECK-NEXT: le lr, .LBB10_5
1615 ; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1616 ; CHECK-NEXT: wls lr, r12, .LBB10_9
1617 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1618 ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1619 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1620 ; CHECK-NEXT: .LBB10_8: @ %for.body.epil
1621 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1622 ; CHECK-NEXT: vldr.16 s2, [r1]
1623 ; CHECK-NEXT: vldr.16 s4, [r0]
1624 ; CHECK-NEXT: adds r0, #2
1625 ; CHECK-NEXT: adds r1, #2
1626 ; CHECK-NEXT: vadd.f16 s2, s4, s2
1627 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1628 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1629 ; CHECK-NEXT: le lr, .LBB10_8
1630 ; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
1631 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1632 ; CHECK-NEXT: .p2align 2
1633 ; CHECK-NEXT: @ %bb.10:
1634 ; CHECK-NEXT: .LCPI10_0:
1635 ; CHECK-NEXT: .long 0x00000000 @ float 0
; ---- IR under test ----
; Entry: an empty range (N == 0) goes straight to the exit block.
1637 %cmp9 = icmp eq i32 %N, 0
1638 br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
; Choose between the unrolled loop and the remainder-only path.
; %xtraiter = N & 3 is the number of leftover elements.
; NOTE(review): %0 is not defined in the lines visible here. It is presumably
; N - 1 (the expected asm does `subs r3, r2, #1` before the compare) - confirm.
1640 for.body.preheader: ; preds = %entry
1642 %xtraiter = and i32 %N, 3
1643 %1 = icmp ult i32 %0, 3
1644 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
; %unroll_iter = N - (N & 3): the countdown start value for %for.body.
1646 for.body.preheader.new: ; preds = %for.body.preheader
1647 %unroll_iter = sub i32 %N, %xtraiter
; Join point after (or instead of) the unrolled loop. Forwards the next index
; and the partial sum to the remainder loop, or exits when N & 3 == 0.
1650 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1651 %add2.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add2.3, %for.body ]
1652 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1653 %res.010.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add2.3, %for.body ]
1654 %lcmp.mod = icmp eq i32 %xtraiter, 0
1655 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Remainder loop: one element per iteration; %epil.iter counts down from
; N & 3 to zero.
1657 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1658 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1659 %res.010.epil = phi float [ %add2.epil, %for.body.epil ], [ %res.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1660 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1661 %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.011.epil
1662 %2 = load half, ptr %arrayidx.epil, align 2
1663 %arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.011.epil
1664 %3 = load half, ptr %arrayidx1.epil, align 2
1665 %add.epil = fadd half %2, %3
1666 %conv.epil = fpext half %add.epil to float
1667 %add2.epil = fadd float %res.010.epil, %conv.epil
1668 %inc.epil = add nuw i32 %i.011.epil, 1
1669 %epil.iter.sub = add i32 %epil.iter, -1
1670 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1671 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
; Single exit: returns 0.0 (from %entry), the unrolled-loop sum, or the
; remainder-loop sum, depending on the incoming edge.
1673 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1674 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add2.epil, %for.body.epil ]
1675 ret float %res.0.lcssa
; Main loop, four elements per iteration. Each element does: half + half,
; fpext to float, then a float add chained onto the previous partial sum
; (%add2 -> %add2.1 -> %add2.2 -> %add2.3). Indices i+1..i+3 are formed with
; `or disjoint`; %niter counts down by 4 from %unroll_iter.
1677 for.body: ; preds = %for.body, %for.body.preheader.new
1678 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1679 %res.010 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add2.3, %for.body ]
1680 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1681 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011
1682 %4 = load half, ptr %arrayidx, align 2
1683 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.011
1684 %5 = load half, ptr %arrayidx1, align 2
1685 %add = fadd half %4, %5
1686 %conv = fpext half %add to float
1687 %add2 = fadd float %res.010, %conv
1688 %inc = or disjoint i32 %i.011, 1
1689 %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
1690 %6 = load half, ptr %arrayidx.1, align 2
1691 %arrayidx1.1 = getelementptr inbounds half, ptr %b, i32 %inc
1692 %7 = load half, ptr %arrayidx1.1, align 2
1693 %add.1 = fadd half %6, %7
1694 %conv.1 = fpext half %add.1 to float
1695 %add2.1 = fadd float %add2, %conv.1
1696 %inc.1 = or disjoint i32 %i.011, 2
1697 %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
1698 %8 = load half, ptr %arrayidx.2, align 2
1699 %arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
1700 %9 = load half, ptr %arrayidx1.2, align 2
1701 %add.2 = fadd half %8, %9
1702 %conv.2 = fpext half %add.2 to float
1703 %add2.2 = fadd float %add2.1, %conv.2
1704 %inc.2 = or disjoint i32 %i.011, 3
1705 %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
1706 %10 = load half, ptr %arrayidx.3, align 2
1707 %arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
1708 %11 = load half, ptr %arrayidx1.3, align 2
1709 %add.3 = fadd half %10, %11
1710 %conv.3 = fpext half %add.3 to float
1711 %add2.3 = fadd float %add2.2, %conv.3
1712 %inc.3 = add nuw i32 %i.011, 4
1713 %niter.nsub.3 = add i32 %niter, -4
1714 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1715 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; half_short_mac: float multiply-accumulate of a half array with an i16 array.
;
; For every i in [0, N) the IR below loads a[i] as half and b[i] as i16,
; converts b[i] to half with sitofp, multiplies the two in half precision,
; extends the product to float and adds it to a float running total, which
; is the return value. N == 0 returns 0.0 directly from the entry check.
;
; The IR is already unrolled by four (%for.body) and has a remainder loop
; (%for.body.epil) that runs N & 3 times.
;
; The expected assembly loads the i16 operand with ldrsh, moves it to an S
; register, converts it with vcvt.f16.s32, then does vmul.f16 /
; vcvtb.f32.f16 / vadd.f32. Both loops are closed with `le` and the
; remainder loop is guarded by `wls`.
;
; The assertion lines are autogenerated (see the note at the top of this
; file); regenerate them with the update script instead of editing by hand.
1718 define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
1719 ; CHECK-LABEL: half_short_mac:
1720 ; CHECK: @ %bb.0: @ %entry
1721 ; CHECK-NEXT: push {r4, r5, r6, lr}
1722 ; CHECK-NEXT: cbz r2, .LBB11_3
1723 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1724 ; CHECK-NEXT: and r12, r2, #3
1725 ; CHECK-NEXT: subs r3, r2, #1
1726 ; CHECK-NEXT: cmp r3, #3
1727 ; CHECK-NEXT: bhs .LBB11_4
1728 ; CHECK-NEXT: @ %bb.2:
1729 ; CHECK-NEXT: vldr s0, .LCPI11_0
1730 ; CHECK-NEXT: movs r2, #0
1731 ; CHECK-NEXT: b .LBB11_6
1732 ; CHECK-NEXT: .LBB11_3:
1733 ; CHECK-NEXT: vldr s0, .LCPI11_0
1734 ; CHECK-NEXT: pop {r4, r5, r6, pc}
1735 ; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
1736 ; CHECK-NEXT: bic r2, r2, #3
1737 ; CHECK-NEXT: movs r3, #1
1738 ; CHECK-NEXT: subs r2, #4
1739 ; CHECK-NEXT: vldr s0, .LCPI11_0
1740 ; CHECK-NEXT: adds r4, r0, #4
1741 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1742 ; CHECK-NEXT: adds r3, r1, #4
1743 ; CHECK-NEXT: movs r2, #0
1744 ; CHECK-NEXT: .LBB11_5: @ %for.body
1745 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1746 ; CHECK-NEXT: ldrsh.w r5, [r3, #2]
1747 ; CHECK-NEXT: vldr.16 s2, [r4, #2]
1748 ; CHECK-NEXT: adds r2, #4
1749 ; CHECK-NEXT: vmov s4, r5
1750 ; CHECK-NEXT: ldrsh r5, [r3], #8
1751 ; CHECK-NEXT: vcvt.f16.s32 s4, s4
1752 ; CHECK-NEXT: ldrsh r6, [r3, #-10]
1753 ; CHECK-NEXT: vmul.f16 s2, s2, s4
1754 ; CHECK-NEXT: vmov s6, r5
1755 ; CHECK-NEXT: vldr.16 s4, [r4]
1756 ; CHECK-NEXT: vcvt.f16.s32 s6, s6
1757 ; CHECK-NEXT: ldrsh r5, [r3, #-12]
1758 ; CHECK-NEXT: vmul.f16 s4, s4, s6
1759 ; CHECK-NEXT: vmov s8, r6
1760 ; CHECK-NEXT: vldr.16 s6, [r4, #-2]
1761 ; CHECK-NEXT: vcvt.f16.s32 s8, s8
1762 ; CHECK-NEXT: vmov s10, r5
1763 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1764 ; CHECK-NEXT: vmul.f16 s6, s6, s8
1765 ; CHECK-NEXT: vldr.16 s8, [r4, #-4]
1766 ; CHECK-NEXT: vcvt.f16.s32 s10, s10
1767 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1768 ; CHECK-NEXT: vmul.f16 s8, s8, s10
1769 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1770 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1771 ; CHECK-NEXT: adds r4, #8
1772 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1773 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1774 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1775 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1776 ; CHECK-NEXT: le lr, .LBB11_5
1777 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1778 ; CHECK-NEXT: wls lr, r12, .LBB11_9
1779 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1780 ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1781 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1782 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil
1783 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1784 ; CHECK-NEXT: ldrsh r2, [r1], #2
1785 ; CHECK-NEXT: vldr.16 s2, [r0]
1786 ; CHECK-NEXT: adds r0, #2
1787 ; CHECK-NEXT: vmov s4, r2
1788 ; CHECK-NEXT: vcvt.f16.s32 s4, s4
1789 ; CHECK-NEXT: vmul.f16 s2, s2, s4
1790 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1791 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1792 ; CHECK-NEXT: le lr, .LBB11_8
1793 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
1794 ; CHECK-NEXT: pop {r4, r5, r6, pc}
1795 ; CHECK-NEXT: .p2align 2
1796 ; CHECK-NEXT: @ %bb.10:
1797 ; CHECK-NEXT: .LCPI11_0:
1798 ; CHECK-NEXT: .long 0x00000000 @ float 0
; ---- IR under test ----
; Entry: an empty range (N == 0) goes straight to the exit block.
1800 %cmp10 = icmp eq i32 %N, 0
1801 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
; Choose between the unrolled loop and the remainder-only path.
; %xtraiter = N & 3 is the number of leftover elements.
; NOTE(review): %0 is not defined in the lines visible here. It is presumably
; N - 1 (the expected asm does `subs r3, r2, #1` before the compare) - confirm.
1803 for.body.preheader: ; preds = %entry
1805 %xtraiter = and i32 %N, 3
1806 %1 = icmp ult i32 %0, 3
1807 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
; %unroll_iter = N - (N & 3): the countdown start value for %for.body.
1809 for.body.preheader.new: ; preds = %for.body.preheader
1810 %unroll_iter = sub i32 %N, %xtraiter
; Join point after (or instead of) the unrolled loop. Forwards the next index
; and the partial sum to the remainder loop, or exits when N & 3 == 0.
1813 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1814 %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
1815 %i.012.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1816 %res.011.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
1817 %lcmp.mod = icmp eq i32 %xtraiter, 0
1818 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Remainder loop: one element per iteration; %epil.iter counts down from
; N & 3 to zero.
1820 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1821 %i.012.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.012.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1822 %res.011.epil = phi float [ %add.epil, %for.body.epil ], [ %res.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1823 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1824 %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.012.epil
1825 %2 = load half, ptr %arrayidx.epil, align 2
1826 %arrayidx1.epil = getelementptr inbounds i16, ptr %b, i32 %i.012.epil
1827 %3 = load i16, ptr %arrayidx1.epil, align 2
1828 %conv2.epil = sitofp i16 %3 to half
1829 %mul.epil = fmul half %2, %conv2.epil
1830 %conv3.epil = fpext half %mul.epil to float
1831 %add.epil = fadd float %res.011.epil, %conv3.epil
1832 %inc.epil = add nuw i32 %i.012.epil, 1
1833 %epil.iter.sub = add i32 %epil.iter, -1
1834 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1835 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
; Single exit: returns 0.0 (from %entry), the unrolled-loop sum, or the
; remainder-loop sum, depending on the incoming edge.
1837 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1838 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
1839 ret float %res.0.lcssa
; Main loop, four elements per iteration. Each element does: sitofp of the
; i16 operand to half, half * half, fpext to float, then a float add chained
; onto the previous partial sum (%add -> %add.1 -> %add.2 -> %add.3).
; Indices i+1..i+3 are formed with `or disjoint`; %niter counts down by 4
; from %unroll_iter.
1841 for.body: ; preds = %for.body, %for.body.preheader.new
1842 %i.012 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1843 %res.011 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
1844 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1845 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.012
1846 %4 = load half, ptr %arrayidx, align 2
1847 %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.012
1848 %5 = load i16, ptr %arrayidx1, align 2
1849 %conv2 = sitofp i16 %5 to half
1850 %mul = fmul half %4, %conv2
1851 %conv3 = fpext half %mul to float
1852 %add = fadd float %res.011, %conv3
1853 %inc = or disjoint i32 %i.012, 1
1854 %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
1855 %6 = load half, ptr %arrayidx.1, align 2
1856 %arrayidx1.1 = getelementptr inbounds i16, ptr %b, i32 %inc
1857 %7 = load i16, ptr %arrayidx1.1, align 2
1858 %conv2.1 = sitofp i16 %7 to half
1859 %mul.1 = fmul half %6, %conv2.1
1860 %conv3.1 = fpext half %mul.1 to float
1861 %add.1 = fadd float %add, %conv3.1
1862 %inc.1 = or disjoint i32 %i.012, 2
1863 %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
1864 %8 = load half, ptr %arrayidx.2, align 2
1865 %arrayidx1.2 = getelementptr inbounds i16, ptr %b, i32 %inc.1
1866 %9 = load i16, ptr %arrayidx1.2, align 2
1867 %conv2.2 = sitofp i16 %9 to half
1868 %mul.2 = fmul half %8, %conv2.2
1869 %conv3.2 = fpext half %mul.2 to float
1870 %add.2 = fadd float %add.1, %conv3.2
1871 %inc.2 = or disjoint i32 %i.012, 3
1872 %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
1873 %10 = load half, ptr %arrayidx.3, align 2
1874 %arrayidx1.3 = getelementptr inbounds i16, ptr %b, i32 %inc.2
1875 %11 = load i16, ptr %arrayidx1.3, align 2
1876 %conv2.3 = sitofp i16 %11 to half
1877 %mul.3 = fmul half %10, %conv2.3
1878 %conv3.3 = fpext half %mul.3 to float
1879 %add.3 = fadd float %add.2, %conv3.3
1880 %inc.3 = add nuw i32 %i.012, 4
1881 %niter.nsub.3 = add i32 %niter, -4
1882 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1883 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body