; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 %s -o - | FileCheck %s
; float_float_mul: c[i] = a[i] * b[i] for i in [0, N).
;
; The IR is already in vectorized form:
;   * vector.memcheck picks the <4 x float> loop only when c does not overlap
;     a or b; otherwise the scalar path is taken.
;   * The scalar path is a prologue loop of (N & 3) single-element iterations
;     followed by a loop unrolled by 4.
; The assertions below expect the prologue loop to be lowered to a wls/le
; low-overhead loop and the vector loop to be closed by an le instruction.
define arm_aapcs_vfpcc void @float_float_mul(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB0_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_3: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r2, r3, lsl #2
; CHECK-NEXT: add.w r6, r1, r3, lsl #2
; CHECK-NEXT: cmp r5, r1
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: tst r4, r5
; CHECK-NEXT: it eq
; CHECK-NEXT: andseq.w r7, r7, r6
; CHECK-NEXT: beq .LBB0_11
; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: wls lr, r7, .LBB0_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB0_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r6!, {s0}
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB0_6
; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: blo .LBB0_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB0_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: add.w r5, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: bne .LBB0_9
; CHECK-NEXT: .LBB0_10: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .LBB0_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB0_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB0_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB0_4
; CHECK-NEXT: b .LBB0_10
entry:
  ; Nothing to do for N == 0.
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  ; Fewer than 4 elements: skip the vector path entirely.
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  ; Scalar path entry. %i.09.ph is the first index still to be processed
  ; (0, or %n.vec when arriving from the vector loop).
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  ; %1 = N - 1 - %i.09.ph (xor with -1 is a bitwise NOT, i.e. -i - 1).
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
  ; Prologue loop: one element per iteration, %xtraiter iterations in total.
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
  %2 = load float, float* %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, float* %b, i32 %i.09.prol
  %3 = load float, float* %arrayidx1.prol, align 4
  %mul.prol = fmul float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
  store float %mul.prol, float* %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  ; Skip the unrolled loop when %1 (computed in for.body.preheader22) is below 3.
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck: ; preds = %for.body.preheader
  ; Runtime overlap check: fall back to the scalar path when [c, c+N)
  ; overlaps [a, a+N) or [b, b+N).
  %scevgep = getelementptr float, float* %c, i32 %N
  %scevgep13 = getelementptr float, float* %a, i32 %N
  %scevgep16 = getelementptr float, float* %b, i32 %N
  %bound0 = icmp ugt float* %scevgep13, %c
  %bound1 = icmp ugt float* %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt float* %scevgep16, %c
  %bound119 = icmp ugt float* %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

vector.ph: ; preds = %vector.memcheck
  ; %n.vec = N rounded down to a multiple of 4.
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  ; Vector loop: four elements per iteration.
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, float* %a, i32 %index
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %6, align 4
  %7 = getelementptr inbounds float, float* %b, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load21 = load <4 x float>, <4 x float>* %8, align 4
  %9 = fmul <4 x float> %wide.load, %wide.load21
  %10 = getelementptr inbounds float, float* %c, i32 %index
  %11 = bitcast float* %10 to <4 x float>*
  store <4 x float> %9, <4 x float>* %11, align 4
  %index.next = add i32 %index, 4
  %12 = icmp eq i32 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  ; Done if N was a multiple of 4; otherwise finish on the scalar path.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body.prol.loopexit, %for.body
  ; Scalar loop unrolled by 4.
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
  %13 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.09
  %14 = load float, float* %arrayidx1, align 4
  %mul = fmul float %13, %14
  %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
  %15 = load float, float* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, float* %b, i32 %inc
  %16 = load float, float* %arrayidx1.1, align 4
  %mul.1 = fmul float %15, %16
  %arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
  store float %mul.1, float* %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
  %17 = load float, float* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, float* %b, i32 %inc.1
  %18 = load float, float* %arrayidx1.2, align 4
  %mul.2 = fmul float %17, %18
  %arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
  store float %mul.2, float* %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
  %19 = load float, float* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, float* %b, i32 %inc.2
  %20 = load float, float* %arrayidx1.3, align 4
  %mul.3 = fmul float %19, %20
  %arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
  store float %mul.3, float* %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}

; float_float_add: c[i] = a[i] + b[i] for i in [0, N).
;
; The IR is already in vectorized form:
;   * vector.memcheck picks the <4 x float> loop only when c does not overlap
;     a or b; otherwise the scalar path is taken.
;   * The scalar path is a prologue loop of (N & 3) single-element iterations
;     followed by a loop unrolled by 4.
; The assertions below expect the prologue loop to be lowered to a wls/le
; low-overhead loop and the vector loop to be closed by an le instruction.
define arm_aapcs_vfpcc void @float_float_add(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB1_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_3: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r2, r3, lsl #2
; CHECK-NEXT: add.w r6, r1, r3, lsl #2
; CHECK-NEXT: cmp r5, r1
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: tst r4, r5
; CHECK-NEXT: it eq
; CHECK-NEXT: andseq.w r7, r7, r6
; CHECK-NEXT: beq .LBB1_11
; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: wls lr, r7, .LBB1_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB1_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r6!, {s0}
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB1_6
; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: blo .LBB1_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB1_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: add.w r5, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: bne .LBB1_9
; CHECK-NEXT: .LBB1_10: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .LBB1_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB1_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB1_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB1_4
; CHECK-NEXT: b .LBB1_10
entry:
  ; Nothing to do for N == 0.
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  ; Fewer than 4 elements: skip the vector path entirely.
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  ; Scalar path entry. %i.09.ph is the first index still to be processed
  ; (0, or %n.vec when arriving from the vector loop).
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  ; %1 = N - 1 - %i.09.ph (xor with -1 is a bitwise NOT, i.e. -i - 1).
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
  ; Prologue loop: one element per iteration, %xtraiter iterations in total.
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
  %2 = load float, float* %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, float* %b, i32 %i.09.prol
  %3 = load float, float* %arrayidx1.prol, align 4
  %add.prol = fadd float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
  store float %add.prol, float* %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  ; Skip the unrolled loop when %1 (computed in for.body.preheader22) is below 3.
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck: ; preds = %for.body.preheader
  ; Runtime overlap check: fall back to the scalar path when [c, c+N)
  ; overlaps [a, a+N) or [b, b+N).
  %scevgep = getelementptr float, float* %c, i32 %N
  %scevgep13 = getelementptr float, float* %a, i32 %N
  %scevgep16 = getelementptr float, float* %b, i32 %N
  %bound0 = icmp ugt float* %scevgep13, %c
  %bound1 = icmp ugt float* %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt float* %scevgep16, %c
  %bound119 = icmp ugt float* %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

vector.ph: ; preds = %vector.memcheck
  ; %n.vec = N rounded down to a multiple of 4.
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  ; Vector loop: four elements per iteration.
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, float* %a, i32 %index
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %6, align 4
  %7 = getelementptr inbounds float, float* %b, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load21 = load <4 x float>, <4 x float>* %8, align 4
  %9 = fadd <4 x float> %wide.load, %wide.load21
  %10 = getelementptr inbounds float, float* %c, i32 %index
  %11 = bitcast float* %10 to <4 x float>*
  store <4 x float> %9, <4 x float>* %11, align 4
  %index.next = add i32 %index, 4
  %12 = icmp eq i32 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  ; Done if N was a multiple of 4; otherwise finish on the scalar path.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body.prol.loopexit, %for.body
  ; Scalar loop unrolled by 4.
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
  %13 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.09
  %14 = load float, float* %arrayidx1, align 4
  %add = fadd float %13, %14
  %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
  %15 = load float, float* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, float* %b, i32 %inc
  %16 = load float, float* %arrayidx1.1, align 4
  %add.1 = fadd float %15, %16
  %arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
  store float %add.1, float* %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
  %17 = load float, float* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, float* %b, i32 %inc.1
  %18 = load float, float* %arrayidx1.2, align 4
  %add.2 = fadd float %17, %18
  %arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
  store float %add.2, float* %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
  %19 = load float, float* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, float* %b, i32 %inc.2
  %20 = load float, float* %arrayidx1.3, align 4
  %add.3 = fadd float %19, %20
  %arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
  store float %add.3, float* %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}

; float_float_sub: c[i] = a[i] - b[i] for i in [0, N).
;
; The IR is already in vectorized form:
;   * vector.memcheck picks the <4 x float> loop only when c does not overlap
;     a or b; otherwise the scalar path is taken.
;   * The scalar path is a prologue loop of (N & 3) single-element iterations
;     followed by a loop unrolled by 4.
; The assertions below expect the prologue loop to be lowered to a wls/le
; low-overhead loop and the vector loop to be closed by an le instruction.
define arm_aapcs_vfpcc void @float_float_sub(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_sub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB2_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB2_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB2_4
; CHECK-NEXT: .LBB2_3: @ %vector.memcheck
; CHECK-NEXT: add.w r5, r2, r3, lsl #2
; CHECK-NEXT: add.w r6, r1, r3, lsl #2
; CHECK-NEXT: cmp r5, r1
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r2
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: tst r4, r5
; CHECK-NEXT: it eq
; CHECK-NEXT: andseq.w r7, r7, r6
; CHECK-NEXT: beq .LBB2_11
; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: wls lr, r7, .LBB2_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
; CHECK-NEXT: add.w r7, r2, r12, lsl #2
; CHECK-NEXT: .LBB2_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r6!, {s0}
; CHECK-NEXT: add.w r12, r12, #1
; CHECK-NEXT: vldmia r5!, {s2}
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstmia r7!, {s0}
; CHECK-NEXT: le lr, .LBB2_6
; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: blo .LBB2_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r3, r12
; CHECK-NEXT: lsl.w r12, r12, #2
; CHECK-NEXT: .LBB2_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: add.w r5, r2, r12
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: bne .LBB2_9
; CHECK-NEXT: .LBB2_10: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .LBB2_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB2_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vsub.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB2_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB2_4
; CHECK-NEXT: b .LBB2_10
entry:
  ; Nothing to do for N == 0.
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  ; Fewer than 4 elements: skip the vector path entirely.
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  ; Scalar path entry. %i.09.ph is the first index still to be processed
  ; (0, or %n.vec when arriving from the vector loop).
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  ; %1 = N - 1 - %i.09.ph (xor with -1 is a bitwise NOT, i.e. -i - 1).
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
  ; Prologue loop: one element per iteration, %xtraiter iterations in total.
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
  %2 = load float, float* %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, float* %b, i32 %i.09.prol
  %3 = load float, float* %arrayidx1.prol, align 4
  %sub.prol = fsub float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
  store float %sub.prol, float* %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  ; Skip the unrolled loop when %1 (computed in for.body.preheader22) is below 3.
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck: ; preds = %for.body.preheader
  ; Runtime overlap check: fall back to the scalar path when [c, c+N)
  ; overlaps [a, a+N) or [b, b+N).
  %scevgep = getelementptr float, float* %c, i32 %N
  %scevgep13 = getelementptr float, float* %a, i32 %N
  %scevgep16 = getelementptr float, float* %b, i32 %N
  %bound0 = icmp ugt float* %scevgep13, %c
  %bound1 = icmp ugt float* %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt float* %scevgep16, %c
  %bound119 = icmp ugt float* %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

vector.ph: ; preds = %vector.memcheck
  ; %n.vec = N rounded down to a multiple of 4.
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  ; Vector loop: four elements per iteration.
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, float* %a, i32 %index
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %6, align 4
  %7 = getelementptr inbounds float, float* %b, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load21 = load <4 x float>, <4 x float>* %8, align 4
  %9 = fsub <4 x float> %wide.load, %wide.load21
  %10 = getelementptr inbounds float, float* %c, i32 %index
  %11 = bitcast float* %10 to <4 x float>*
  store <4 x float> %9, <4 x float>* %11, align 4
  %index.next = add i32 %index, 4
  %12 = icmp eq i32 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  ; Done if N was a multiple of 4; otherwise finish on the scalar path.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body.prol.loopexit, %for.body
  ; Scalar loop unrolled by 4.
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
  %13 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.09
  %14 = load float, float* %arrayidx1, align 4
  %sub = fsub float %13, %14
  %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
  store float %sub, float* %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
  %15 = load float, float* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, float* %b, i32 %inc
  %16 = load float, float* %arrayidx1.1, align 4
  %sub.1 = fsub float %15, %16
  %arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
  store float %sub.1, float* %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
  %17 = load float, float* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, float* %b, i32 %inc.1
  %18 = load float, float* %arrayidx1.2, align 4
  %sub.2 = fsub float %17, %18
  %arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
  store float %sub.2, float* %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
  %19 = load float, float* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, float* %b, i32 %inc.2
  %20 = load float, float* %arrayidx1.3, align 4
  %sub.3 = fsub float %19, %20
  %arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
  store float %sub.3, float* %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}

643 define arm_aapcs_vfpcc void @float_int_mul(float* nocapture readonly %a, i32* nocapture readonly %b, float* nocapture %c, i32 %N) {
644 ; CHECK-LABEL: float_int_mul:
645 ; CHECK: @ %bb.0: @ %entry
646 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
647 ; CHECK-NEXT: cmp r3, #0
648 ; CHECK-NEXT: beq.w .LBB3_13
649 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
650 ; CHECK-NEXT: cmp r3, #3
651 ; CHECK-NEXT: bls .LBB3_6
652 ; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
653 ; CHECK-NEXT: add.w r7, r0, r3, lsl #2
654 ; CHECK-NEXT: cmp r7, r2
656 ; CHECK-NEXT: addhi.w r7, r2, r3, lsl #2
657 ; CHECK-NEXT: cmphi r7, r0
658 ; CHECK-NEXT: bhi .LBB3_6
659 ; CHECK-NEXT: @ %bb.3: @ %vector.ph
660 ; CHECK-NEXT: bic r12, r3, #3
661 ; CHECK-NEXT: movs r6, #1
662 ; CHECK-NEXT: sub.w r7, r12, #4
663 ; CHECK-NEXT: mov r4, r0
664 ; CHECK-NEXT: mov r5, r1
665 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
666 ; CHECK-NEXT: mov r6, r2
667 ; CHECK-NEXT: .LBB3_4: @ %vector.body
668 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
669 ; CHECK-NEXT: vldrw.u32 q0, [r5], #16
670 ; CHECK-NEXT: vldrw.u32 q1, [r4], #16
671 ; CHECK-NEXT: vcvt.f32.s32 q0, q0
672 ; CHECK-NEXT: vmul.f32 q0, q1, q0
673 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
674 ; CHECK-NEXT: le lr, .LBB3_4
675 ; CHECK-NEXT: @ %bb.5: @ %middle.block
676 ; CHECK-NEXT: cmp r12, r3
677 ; CHECK-NEXT: bne .LBB3_7
678 ; CHECK-NEXT: b .LBB3_13
679 ; CHECK-NEXT: .LBB3_6:
680 ; CHECK-NEXT: mov.w r12, #0
681 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16
682 ; CHECK-NEXT: mvn.w r7, r12
683 ; CHECK-NEXT: add.w r8, r7, r3
684 ; CHECK-NEXT: and r7, r3, #3
685 ; CHECK-NEXT: wls lr, r7, .LBB3_10
686 ; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader
687 ; CHECK-NEXT: add.w r5, r0, r12, lsl #2
688 ; CHECK-NEXT: add.w r6, r1, r12, lsl #2
689 ; CHECK-NEXT: add.w r7, r2, r12, lsl #2
690 ; CHECK-NEXT: .LBB3_9: @ %for.body.prol
691 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
692 ; CHECK-NEXT: ldr r4, [r6], #4
693 ; CHECK-NEXT: add.w r12, r12, #1
694 ; CHECK-NEXT: vldmia r5!, {s2}
695 ; CHECK-NEXT: vmov s0, r4
696 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
697 ; CHECK-NEXT: vmul.f32 s0, s2, s0
698 ; CHECK-NEXT: vstmia r7!, {s0}
699 ; CHECK-NEXT: le lr, .LBB3_9
700 ; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit
701 ; CHECK-NEXT: cmp.w r8, #3
702 ; CHECK-NEXT: blo .LBB3_13
703 ; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1
704 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2
705 ; CHECK-NEXT: sub.w r3, r3, r12
706 ; CHECK-NEXT: adds r1, #8
707 ; CHECK-NEXT: lsl.w r12, r12, #2
708 ; CHECK-NEXT: .LBB3_12: @ %for.body
709 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
710 ; CHECK-NEXT: vldr s0, [r1, #-8]
711 ; CHECK-NEXT: add.w r7, r0, r12
712 ; CHECK-NEXT: add.w r6, r2, r12
713 ; CHECK-NEXT: adds r0, #16
714 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
715 ; CHECK-NEXT: vldr s2, [r7]
716 ; CHECK-NEXT: adds r2, #16
717 ; CHECK-NEXT: subs r3, #4
718 ; CHECK-NEXT: vmul.f32 s0, s2, s0
719 ; CHECK-NEXT: vstr s0, [r6]
720 ; CHECK-NEXT: vldr s0, [r1, #-4]
721 ; CHECK-NEXT: vldr s2, [r7, #4]
722 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
723 ; CHECK-NEXT: vmul.f32 s0, s2, s0
724 ; CHECK-NEXT: vstr s0, [r6, #4]
725 ; CHECK-NEXT: vldr s0, [r1]
726 ; CHECK-NEXT: vldr s2, [r7, #8]
727 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
728 ; CHECK-NEXT: vmul.f32 s0, s2, s0
729 ; CHECK-NEXT: vstr s0, [r6, #8]
730 ; CHECK-NEXT: vldr s0, [r1, #4]
731 ; CHECK-NEXT: add.w r1, r1, #16
732 ; CHECK-NEXT: vldr s2, [r7, #12]
733 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
734 ; CHECK-NEXT: vmul.f32 s0, s2, s0
735 ; CHECK-NEXT: vstr s0, [r6, #12]
736 ; CHECK-NEXT: bne .LBB3_12
737 ; CHECK-NEXT: .LBB3_13: @ %for.cond.cleanup
738 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
740 %cmp8 = icmp eq i32 %N, 0
741 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
743 for.body.preheader: ; preds = %entry
744 %min.iters.check = icmp ult i32 %N, 4
745 br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck
747 for.body.preheader16: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
748 %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
749 %0 = xor i32 %i.09.ph, -1
751 %xtraiter = and i32 %N, 3
752 %lcmp.mod = icmp eq i32 %xtraiter, 0
753 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
755 for.body.prol: ; preds = %for.body.preheader16, %for.body.prol
756 %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader16 ]
757 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader16 ]
758 %arrayidx.prol = getelementptr inbounds float, float* %a, i32 %i.09.prol
759 %2 = load float, float* %arrayidx.prol, align 4
760 %arrayidx1.prol = getelementptr inbounds i32, i32* %b, i32 %i.09.prol
761 %3 = load i32, i32* %arrayidx1.prol, align 4
762 %conv.prol = sitofp i32 %3 to float
763 %mul.prol = fmul float %2, %conv.prol
764 %arrayidx2.prol = getelementptr inbounds float, float* %c, i32 %i.09.prol
765 store float %mul.prol, float* %arrayidx2.prol, align 4
766 %inc.prol = add nuw i32 %i.09.prol, 1
767 %prol.iter.sub = add i32 %prol.iter, -1
768 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
769 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
771 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader16
772 %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
773 %4 = icmp ult i32 %1, 3
774 br i1 %4, label %for.cond.cleanup, label %for.body
776 vector.memcheck: ; preds = %for.body.preheader
777 %scevgep = getelementptr float, float* %c, i32 %N
778 %scevgep13 = getelementptr float, float* %a, i32 %N
779 %bound0 = icmp ugt float* %scevgep13, %c
780 %bound1 = icmp ugt float* %scevgep, %a
781 %found.conflict = and i1 %bound0, %bound1
782 br i1 %found.conflict, label %for.body.preheader16, label %vector.ph
784 vector.ph: ; preds = %vector.memcheck
785 %n.vec = and i32 %N, -4
786 br label %vector.body
788 vector.body: ; preds = %vector.body, %vector.ph
789 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
790 %5 = getelementptr inbounds float, float* %a, i32 %index
791 %6 = bitcast float* %5 to <4 x float>*
792 %wide.load = load <4 x float>, <4 x float>* %6, align 4
793 %7 = getelementptr inbounds i32, i32* %b, i32 %index
794 %8 = bitcast i32* %7 to <4 x i32>*
795 %wide.load15 = load <4 x i32>, <4 x i32>* %8, align 4
796 %9 = sitofp <4 x i32> %wide.load15 to <4 x float>
797 %10 = fmul <4 x float> %wide.load, %9
798 %11 = getelementptr inbounds float, float* %c, i32 %index
799 %12 = bitcast float* %11 to <4 x float>*
800 store <4 x float> %10, <4 x float>* %12, align 4
801 %index.next = add i32 %index, 4
802 %13 = icmp eq i32 %index.next, %n.vec
803 br i1 %13, label %middle.block, label %vector.body
805 middle.block: ; preds = %vector.body
806 %cmp.n = icmp eq i32 %n.vec, %N
807 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16
809 for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
812 for.body: ; preds = %for.body.prol.loopexit, %for.body
813 %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
814 %arrayidx = getelementptr inbounds float, float* %a, i32 %i.09
815 %14 = load float, float* %arrayidx, align 4
816 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09
817 %15 = load i32, i32* %arrayidx1, align 4
818 %conv = sitofp i32 %15 to float
819 %mul = fmul float %14, %conv
820 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
821 store float %mul, float* %arrayidx2, align 4
822 %inc = add nuw i32 %i.09, 1
823 %arrayidx.1 = getelementptr inbounds float, float* %a, i32 %inc
824 %16 = load float, float* %arrayidx.1, align 4
825 %arrayidx1.1 = getelementptr inbounds i32, i32* %b, i32 %inc
826 %17 = load i32, i32* %arrayidx1.1, align 4
827 %conv.1 = sitofp i32 %17 to float
828 %mul.1 = fmul float %16, %conv.1
829 %arrayidx2.1 = getelementptr inbounds float, float* %c, i32 %inc
830 store float %mul.1, float* %arrayidx2.1, align 4
831 %inc.1 = add nuw i32 %i.09, 2
832 %arrayidx.2 = getelementptr inbounds float, float* %a, i32 %inc.1
833 %18 = load float, float* %arrayidx.2, align 4
834 %arrayidx1.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
835 %19 = load i32, i32* %arrayidx1.2, align 4
836 %conv.2 = sitofp i32 %19 to float
837 %mul.2 = fmul float %18, %conv.2
838 %arrayidx2.2 = getelementptr inbounds float, float* %c, i32 %inc.1
839 store float %mul.2, float* %arrayidx2.2, align 4
840 %inc.2 = add nuw i32 %i.09, 3
841 %arrayidx.3 = getelementptr inbounds float, float* %a, i32 %inc.2
842 %20 = load float, float* %arrayidx.3, align 4
843 %arrayidx1.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
844 %21 = load i32, i32* %arrayidx1.3, align 4
845 %conv.3 = sitofp i32 %21 to float
846 %mul.3 = fmul float %20, %conv.3
847 %arrayidx2.3 = getelementptr inbounds float, float* %c, i32 %inc.2
848 store float %mul.3, float* %arrayidx2.3, align 4
849 %inc.3 = add nuw i32 %i.09, 4
850 %exitcond.3 = icmp eq i32 %inc.3, %N
851 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
; float_int_int_mul: c[i] = (float)(b[i] * a[i]) for i in [0, N), with i32
; inputs %a/%b and a float output %c.
; The IR is already vectorized: vector.body handles the first (N & -4)
; elements four at a time and for.body handles the remainder one at a time.
; N < 4 bypasses the vector loop and N == 0 bypasses both loops.
; Expected output: the vector loop uses post-incremented vldrw.u32 loads,
; vmul.i32, vcvt.f32.s32 and a vstrb.8 store, closed by `le lr`; the scalar
; loop uses ldr/muls/vmov/vcvt.f32.s32/vstmia, also closed by `le lr`.
; NOTE(review): in this copy no `entry:` label, no terminator for
; %for.body.preheader11, no instruction in %for.cond.cleanup and no closing
; `}` are visible, and `popeq` is expected with no IT instruction before it.
; These lines look dropped -- confirm against the upstream test.
854 define arm_aapcs_vfpcc void @float_int_int_mul(i32* nocapture readonly %a, i32* nocapture readonly %b, float* nocapture %c, i32 %N) {
855 ; CHECK-LABEL: float_int_int_mul:
856 ; CHECK: @ %bb.0: @ %entry
857 ; CHECK-NEXT: push {r4, r5, r6, lr}
858 ; CHECK-NEXT: cbz r3, .LBB4_8
859 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
860 ; CHECK-NEXT: cmp r3, #3
861 ; CHECK-NEXT: bhi .LBB4_3
862 ; CHECK-NEXT: @ %bb.2:
863 ; CHECK-NEXT: mov.w r12, #0
864 ; CHECK-NEXT: b .LBB4_6
865 ; CHECK-NEXT: .LBB4_3: @ %vector.ph
866 ; CHECK-NEXT: bic r12, r3, #3
867 ; CHECK-NEXT: movs r5, #1
868 ; CHECK-NEXT: sub.w r6, r12, #4
869 ; CHECK-NEXT: mov r4, r0
870 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
871 ; CHECK-NEXT: mov r5, r1
872 ; CHECK-NEXT: mov r6, r2
873 ; CHECK-NEXT: .LBB4_4: @ %vector.body
874 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
875 ; CHECK-NEXT: vldrw.u32 q0, [r4], #16
876 ; CHECK-NEXT: vldrw.u32 q1, [r5], #16
877 ; CHECK-NEXT: vmul.i32 q0, q1, q0
878 ; CHECK-NEXT: vcvt.f32.s32 q0, q0
879 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
880 ; CHECK-NEXT: le lr, .LBB4_4
881 ; CHECK-NEXT: @ %bb.5: @ %middle.block
882 ; CHECK-NEXT: cmp r12, r3
884 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
885 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11
886 ; CHECK-NEXT: sub.w lr, r3, r12
887 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2
888 ; CHECK-NEXT: add.w r1, r1, r12, lsl #2
889 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
890 ; CHECK-NEXT: .LBB4_7: @ %for.body
891 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
892 ; CHECK-NEXT: ldr r3, [r0], #4
893 ; CHECK-NEXT: ldr r6, [r1], #4
894 ; CHECK-NEXT: muls r3, r6, r3
895 ; CHECK-NEXT: vmov s0, r3
896 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
897 ; CHECK-NEXT: vstmia r2!, {s0}
898 ; CHECK-NEXT: le lr, .LBB4_7
899 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
900 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Nothing to do when N == 0.
902 %cmp8 = icmp eq i32 %N, 0
903 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
905 for.body.preheader: ; preds = %entry
; Fewer than four elements: go straight to the scalar loop.
906 %min.iters.check = icmp ult i32 %N, 4
907 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop start index: 0 when the vector loop was skipped, %n.vec after it.
909 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
910 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
913 vector.ph: ; preds = %for.body.preheader
; %n.vec = N rounded down to a multiple of 4 (elements covered by vector.body).
914 %n.vec = and i32 %N, -4
915 br label %vector.body
; Main vector loop: four i32 products converted to float per iteration.
917 vector.body: ; preds = %vector.body, %vector.ph
918 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
919 %0 = getelementptr inbounds i32, i32* %a, i32 %index
920 %1 = bitcast i32* %0 to <4 x i32>*
921 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
922 %2 = getelementptr inbounds i32, i32* %b, i32 %index
923 %3 = bitcast i32* %2 to <4 x i32>*
924 %wide.load10 = load <4 x i32>, <4 x i32>* %3, align 4
925 %4 = mul nsw <4 x i32> %wide.load10, %wide.load
926 %5 = sitofp <4 x i32> %4 to <4 x float>
927 %6 = getelementptr inbounds float, float* %c, i32 %index
928 %7 = bitcast float* %6 to <4 x float>*
929 store <4 x float> %5, <4 x float>* %7, align 4
930 %index.next = add i32 %index, 4
931 %8 = icmp eq i32 %index.next, %n.vec
932 br i1 %8, label %middle.block, label %vector.body
934 middle.block: ; preds = %vector.body
; Skip the scalar remainder when the vector loop already covered all N elements.
935 %cmp.n = icmp eq i32 %n.vec, %N
936 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
938 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar remainder loop: one element per iteration until %inc == N.
941 for.body: ; preds = %for.body.preheader11, %for.body
942 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
943 %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.09
944 %9 = load i32, i32* %arrayidx, align 4
945 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09
946 %10 = load i32, i32* %arrayidx1, align 4
947 %mul = mul nsw i32 %10, %9
948 %conv = sitofp i32 %mul to float
949 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
950 store float %conv, float* %arrayidx2, align 4
951 %inc = add nuw i32 %i.09, 1
952 %exitcond = icmp eq i32 %inc, %N
953 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_mul: c[i] = (float)(a[i] * b[i]) for i in [0, N); the product is
; computed in half precision (fmul half) and then extended to float.
; The IR is already vectorized: vector.body handles the first (N & -4)
; elements as <4 x half>, for.body handles the remainder one at a time.
; Expected output: each <4 x half> operand is fetched as two 32-bit ldr loads
; and inserted into lanes 0 and 1 of a q register with vmov.32, multiplied with
; vmul.f16, widened with four scalar vcvtt/vcvtb.f32.f16 converts and stored
; with vstrb.8; the scalar loop uses vldr.16/vmul.f16/vcvtb.f32.f16/vstmia.
; NOTE(review): in this copy no `entry:` label, no terminator for
; %for.body.preheader11, no instruction in %for.cond.cleanup and no closing
; `}` are visible. These lines look dropped -- confirm against the upstream test.
956 define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
957 ; CHECK-LABEL: half_half_mul:
958 ; CHECK: @ %bb.0: @ %entry
959 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
960 ; CHECK-NEXT: cmp r3, #0
961 ; CHECK-NEXT: beq .LBB5_8
962 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
963 ; CHECK-NEXT: cmp r3, #3
964 ; CHECK-NEXT: bhi .LBB5_3
965 ; CHECK-NEXT: @ %bb.2:
966 ; CHECK-NEXT: mov.w r12, #0
967 ; CHECK-NEXT: b .LBB5_6
968 ; CHECK-NEXT: .LBB5_3: @ %vector.ph
969 ; CHECK-NEXT: bic r12, r3, #3
970 ; CHECK-NEXT: movs r5, #1
971 ; CHECK-NEXT: sub.w r6, r12, #4
972 ; CHECK-NEXT: mov r4, r0
973 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
974 ; CHECK-NEXT: mov r5, r1
975 ; CHECK-NEXT: mov r6, r2
976 ; CHECK-NEXT: .LBB5_4: @ %vector.body
977 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
978 ; CHECK-NEXT: ldr.w r9, [r4]
979 ; CHECK-NEXT: ldr r7, [r5]
980 ; CHECK-NEXT: ldr.w r8, [r4, #4]
981 ; CHECK-NEXT: vmov.32 q0[0], r9
982 ; CHECK-NEXT: ldr.w r10, [r5, #4]
983 ; CHECK-NEXT: vmov.32 q1[0], r7
984 ; CHECK-NEXT: vmov.32 q0[1], r8
985 ; CHECK-NEXT: adds r4, #8
986 ; CHECK-NEXT: vmov.32 q1[1], r10
987 ; CHECK-NEXT: adds r5, #8
988 ; CHECK-NEXT: vmul.f16 q0, q0, q1
989 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
990 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
991 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
992 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
993 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
994 ; CHECK-NEXT: le lr, .LBB5_4
995 ; CHECK-NEXT: @ %bb.5: @ %middle.block
996 ; CHECK-NEXT: cmp r12, r3
997 ; CHECK-NEXT: beq .LBB5_8
998 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11
999 ; CHECK-NEXT: sub.w lr, r3, r12
1000 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1001 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1002 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
1003 ; CHECK-NEXT: .LBB5_7: @ %for.body
1004 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1005 ; CHECK-NEXT: vldr.16 s0, [r1]
1006 ; CHECK-NEXT: vldr.16 s2, [r0]
1007 ; CHECK-NEXT: adds r0, #2
1008 ; CHECK-NEXT: adds r1, #2
1009 ; CHECK-NEXT: vmul.f16 s0, s2, s0
1010 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1011 ; CHECK-NEXT: vstmia r2!, {s0}
1012 ; CHECK-NEXT: le lr, .LBB5_7
1013 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
1014 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; Nothing to do when N == 0.
1016 %cmp8 = icmp eq i32 %N, 0
1017 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1019 for.body.preheader: ; preds = %entry
; Fewer than four elements: go straight to the scalar loop.
1020 %min.iters.check = icmp ult i32 %N, 4
1021 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop start index: 0 when the vector loop was skipped, %n.vec after it.
1023 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
1024 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1027 vector.ph: ; preds = %for.body.preheader
; %n.vec = N rounded down to a multiple of 4 (elements covered by vector.body).
1028 %n.vec = and i32 %N, -4
1029 br label %vector.body
; Main vector loop: <4 x half> multiply, then fpext to <4 x float>.
; The half loads are only 2-byte aligned.
1031 vector.body: ; preds = %vector.body, %vector.ph
1032 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1033 %0 = getelementptr inbounds half, half* %a, i32 %index
1034 %1 = bitcast half* %0 to <4 x half>*
1035 %wide.load = load <4 x half>, <4 x half>* %1, align 2
1036 %2 = getelementptr inbounds half, half* %b, i32 %index
1037 %3 = bitcast half* %2 to <4 x half>*
1038 %wide.load10 = load <4 x half>, <4 x half>* %3, align 2
1039 %4 = fmul <4 x half> %wide.load, %wide.load10
1040 %5 = fpext <4 x half> %4 to <4 x float>
1041 %6 = getelementptr inbounds float, float* %c, i32 %index
1042 %7 = bitcast float* %6 to <4 x float>*
1043 store <4 x float> %5, <4 x float>* %7, align 4
1044 %index.next = add i32 %index, 4
1045 %8 = icmp eq i32 %index.next, %n.vec
1046 br i1 %8, label %middle.block, label %vector.body
1048 middle.block: ; preds = %vector.body
; Skip the scalar remainder when the vector loop already covered all N elements.
1049 %cmp.n = icmp eq i32 %n.vec, %N
1050 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1052 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar remainder loop: one element per iteration until %inc == N.
1055 for.body: ; preds = %for.body.preheader11, %for.body
1056 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1057 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.09
1058 %9 = load half, half* %arrayidx, align 2
1059 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.09
1060 %10 = load half, half* %arrayidx1, align 2
1061 %mul = fmul half %9, %10
1062 %conv = fpext half %mul to float
1063 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
1064 store float %conv, float* %arrayidx2, align 4
1065 %inc = add nuw i32 %i.09, 1
1066 %exitcond = icmp eq i32 %inc, %N
1067 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_add: c[i] = (float)(a[i] + b[i]) for i in [0, N); the sum is
; computed in half precision (fadd half) and then extended to float.
; The IR is already vectorized: vector.body handles the first (N & -4)
; elements as <4 x half>, for.body handles the remainder one at a time.
; Expected output: each <4 x half> operand is fetched as two 32-bit ldr loads
; and inserted into lanes 0 and 1 of a q register with vmov.32, added with
; vadd.f16, widened with four scalar vcvtt/vcvtb.f32.f16 converts and stored
; with vstrb.8; the scalar loop uses vldr.16/vadd.f16/vcvtb.f32.f16/vstmia.
; NOTE(review): in this copy no `entry:` label, no terminator for
; %for.body.preheader11, no instruction in %for.cond.cleanup and no closing
; `}` are visible. These lines look dropped -- confirm against the upstream test.
1070 define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
1071 ; CHECK-LABEL: half_half_add:
1072 ; CHECK: @ %bb.0: @ %entry
1073 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1074 ; CHECK-NEXT: cmp r3, #0
1075 ; CHECK-NEXT: beq .LBB6_8
1076 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1077 ; CHECK-NEXT: cmp r3, #3
1078 ; CHECK-NEXT: bhi .LBB6_3
1079 ; CHECK-NEXT: @ %bb.2:
1080 ; CHECK-NEXT: mov.w r12, #0
1081 ; CHECK-NEXT: b .LBB6_6
1082 ; CHECK-NEXT: .LBB6_3: @ %vector.ph
1083 ; CHECK-NEXT: bic r12, r3, #3
1084 ; CHECK-NEXT: movs r5, #1
1085 ; CHECK-NEXT: sub.w r6, r12, #4
1086 ; CHECK-NEXT: mov r4, r0
1087 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1088 ; CHECK-NEXT: mov r5, r1
1089 ; CHECK-NEXT: mov r6, r2
1090 ; CHECK-NEXT: .LBB6_4: @ %vector.body
1091 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1092 ; CHECK-NEXT: ldr.w r9, [r4]
1093 ; CHECK-NEXT: ldr r7, [r5]
1094 ; CHECK-NEXT: ldr.w r8, [r4, #4]
1095 ; CHECK-NEXT: vmov.32 q0[0], r9
1096 ; CHECK-NEXT: ldr.w r10, [r5, #4]
1097 ; CHECK-NEXT: vmov.32 q1[0], r7
1098 ; CHECK-NEXT: vmov.32 q0[1], r8
1099 ; CHECK-NEXT: adds r4, #8
1100 ; CHECK-NEXT: vmov.32 q1[1], r10
1101 ; CHECK-NEXT: adds r5, #8
1102 ; CHECK-NEXT: vadd.f16 q0, q0, q1
1103 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
1104 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
1105 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
1106 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1107 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
1108 ; CHECK-NEXT: le lr, .LBB6_4
1109 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1110 ; CHECK-NEXT: cmp r12, r3
1111 ; CHECK-NEXT: beq .LBB6_8
1112 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11
1113 ; CHECK-NEXT: sub.w lr, r3, r12
1114 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1115 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1116 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
1117 ; CHECK-NEXT: .LBB6_7: @ %for.body
1118 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1119 ; CHECK-NEXT: vldr.16 s0, [r1]
1120 ; CHECK-NEXT: vldr.16 s2, [r0]
1121 ; CHECK-NEXT: adds r0, #2
1122 ; CHECK-NEXT: adds r1, #2
1123 ; CHECK-NEXT: vadd.f16 s0, s2, s0
1124 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1125 ; CHECK-NEXT: vstmia r2!, {s0}
1126 ; CHECK-NEXT: le lr, .LBB6_7
1127 ; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
1128 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; Nothing to do when N == 0.
1130 %cmp8 = icmp eq i32 %N, 0
1131 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1133 for.body.preheader: ; preds = %entry
; Fewer than four elements: go straight to the scalar loop.
1134 %min.iters.check = icmp ult i32 %N, 4
1135 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop start index: 0 when the vector loop was skipped, %n.vec after it.
1137 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
1138 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1141 vector.ph: ; preds = %for.body.preheader
; %n.vec = N rounded down to a multiple of 4 (elements covered by vector.body).
1142 %n.vec = and i32 %N, -4
1143 br label %vector.body
; Main vector loop: <4 x half> add, then fpext to <4 x float>.
; The half loads are only 2-byte aligned.
1145 vector.body: ; preds = %vector.body, %vector.ph
1146 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1147 %0 = getelementptr inbounds half, half* %a, i32 %index
1148 %1 = bitcast half* %0 to <4 x half>*
1149 %wide.load = load <4 x half>, <4 x half>* %1, align 2
1150 %2 = getelementptr inbounds half, half* %b, i32 %index
1151 %3 = bitcast half* %2 to <4 x half>*
1152 %wide.load10 = load <4 x half>, <4 x half>* %3, align 2
1153 %4 = fadd <4 x half> %wide.load, %wide.load10
1154 %5 = fpext <4 x half> %4 to <4 x float>
1155 %6 = getelementptr inbounds float, float* %c, i32 %index
1156 %7 = bitcast float* %6 to <4 x float>*
1157 store <4 x float> %5, <4 x float>* %7, align 4
1158 %index.next = add i32 %index, 4
1159 %8 = icmp eq i32 %index.next, %n.vec
1160 br i1 %8, label %middle.block, label %vector.body
1162 middle.block: ; preds = %vector.body
; Skip the scalar remainder when the vector loop already covered all N elements.
1163 %cmp.n = icmp eq i32 %n.vec, %N
1164 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1166 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar remainder loop: one element per iteration until %inc == N.
1169 for.body: ; preds = %for.body.preheader11, %for.body
1170 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1171 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.09
1172 %9 = load half, half* %arrayidx, align 2
1173 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.09
1174 %10 = load half, half* %arrayidx1, align 2
1175 %add = fadd half %9, %10
1176 %conv = fpext half %add to float
1177 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
1178 store float %conv, float* %arrayidx2, align 4
1179 %inc = add nuw i32 %i.09, 1
1180 %exitcond = icmp eq i32 %inc, %N
1181 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_half_sub: c[i] = (float)(a[i] - b[i]) for i in [0, N); the difference
; is computed in half precision (fsub half) and then extended to float.
; The IR is already vectorized: vector.body handles the first (N & -4)
; elements as <4 x half>, for.body handles the remainder one at a time.
; Expected output: each <4 x half> operand is fetched as two 32-bit ldr loads
; and inserted into lanes 0 and 1 of a q register with vmov.32, subtracted with
; vsub.f16, widened with four scalar vcvtt/vcvtb.f32.f16 converts and stored
; with vstrb.8; the scalar loop uses vldr.16/vsub.f16/vcvtb.f32.f16/vstmia.
; NOTE(review): in this copy no `entry:` label, no terminator for
; %for.body.preheader11, no instruction in %for.cond.cleanup and no closing
; `}` are visible. These lines look dropped -- confirm against the upstream test.
1184 define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
1185 ; CHECK-LABEL: half_half_sub:
1186 ; CHECK: @ %bb.0: @ %entry
1187 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1188 ; CHECK-NEXT: cmp r3, #0
1189 ; CHECK-NEXT: beq .LBB7_8
1190 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1191 ; CHECK-NEXT: cmp r3, #3
1192 ; CHECK-NEXT: bhi .LBB7_3
1193 ; CHECK-NEXT: @ %bb.2:
1194 ; CHECK-NEXT: mov.w r12, #0
1195 ; CHECK-NEXT: b .LBB7_6
1196 ; CHECK-NEXT: .LBB7_3: @ %vector.ph
1197 ; CHECK-NEXT: bic r12, r3, #3
1198 ; CHECK-NEXT: movs r5, #1
1199 ; CHECK-NEXT: sub.w r6, r12, #4
1200 ; CHECK-NEXT: mov r4, r0
1201 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1202 ; CHECK-NEXT: mov r5, r1
1203 ; CHECK-NEXT: mov r6, r2
1204 ; CHECK-NEXT: .LBB7_4: @ %vector.body
1205 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1206 ; CHECK-NEXT: ldr.w r9, [r4]
1207 ; CHECK-NEXT: ldr r7, [r5]
1208 ; CHECK-NEXT: ldr.w r8, [r4, #4]
1209 ; CHECK-NEXT: vmov.32 q0[0], r9
1210 ; CHECK-NEXT: ldr.w r10, [r5, #4]
1211 ; CHECK-NEXT: vmov.32 q1[0], r7
1212 ; CHECK-NEXT: vmov.32 q0[1], r8
1213 ; CHECK-NEXT: adds r4, #8
1214 ; CHECK-NEXT: vmov.32 q1[1], r10
1215 ; CHECK-NEXT: adds r5, #8
1216 ; CHECK-NEXT: vsub.f16 q0, q0, q1
1217 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
1218 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
1219 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
1220 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1221 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
1222 ; CHECK-NEXT: le lr, .LBB7_4
1223 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1224 ; CHECK-NEXT: cmp r12, r3
1225 ; CHECK-NEXT: beq .LBB7_8
1226 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11
1227 ; CHECK-NEXT: sub.w lr, r3, r12
1228 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1229 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1230 ; CHECK-NEXT: add.w r2, r2, r12, lsl #2
1231 ; CHECK-NEXT: .LBB7_7: @ %for.body
1232 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1233 ; CHECK-NEXT: vldr.16 s0, [r1]
1234 ; CHECK-NEXT: vldr.16 s2, [r0]
1235 ; CHECK-NEXT: adds r0, #2
1236 ; CHECK-NEXT: adds r1, #2
1237 ; CHECK-NEXT: vsub.f16 s0, s2, s0
1238 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1239 ; CHECK-NEXT: vstmia r2!, {s0}
1240 ; CHECK-NEXT: le lr, .LBB7_7
1241 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
1242 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; Nothing to do when N == 0.
1244 %cmp8 = icmp eq i32 %N, 0
1245 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1247 for.body.preheader: ; preds = %entry
; Fewer than four elements: go straight to the scalar loop.
1248 %min.iters.check = icmp ult i32 %N, 4
1249 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
; Scalar loop start index: 0 when the vector loop was skipped, %n.vec after it.
1251 for.body.preheader11: ; preds = %middle.block, %for.body.preheader
1252 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1255 vector.ph: ; preds = %for.body.preheader
; %n.vec = N rounded down to a multiple of 4 (elements covered by vector.body).
1256 %n.vec = and i32 %N, -4
1257 br label %vector.body
; Main vector loop: <4 x half> subtract (a - b), then fpext to <4 x float>.
; The half loads are only 2-byte aligned.
1259 vector.body: ; preds = %vector.body, %vector.ph
1260 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1261 %0 = getelementptr inbounds half, half* %a, i32 %index
1262 %1 = bitcast half* %0 to <4 x half>*
1263 %wide.load = load <4 x half>, <4 x half>* %1, align 2
1264 %2 = getelementptr inbounds half, half* %b, i32 %index
1265 %3 = bitcast half* %2 to <4 x half>*
1266 %wide.load10 = load <4 x half>, <4 x half>* %3, align 2
1267 %4 = fsub <4 x half> %wide.load, %wide.load10
1268 %5 = fpext <4 x half> %4 to <4 x float>
1269 %6 = getelementptr inbounds float, float* %c, i32 %index
1270 %7 = bitcast float* %6 to <4 x float>*
1271 store <4 x float> %5, <4 x float>* %7, align 4
1272 %index.next = add i32 %index, 4
1273 %8 = icmp eq i32 %index.next, %n.vec
1274 br i1 %8, label %middle.block, label %vector.body
1276 middle.block: ; preds = %vector.body
; Skip the scalar remainder when the vector loop already covered all N elements.
1277 %cmp.n = icmp eq i32 %n.vec, %N
1278 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1280 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar remainder loop: one element per iteration until %inc == N.
1283 for.body: ; preds = %for.body.preheader11, %for.body
1284 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1285 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.09
1286 %9 = load half, half* %arrayidx, align 2
1287 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.09
1288 %10 = load half, half* %arrayidx1, align 2
1289 %sub = fsub half %9, %10
1290 %conv = fpext half %sub to float
1291 %arrayidx2 = getelementptr inbounds float, float* %c, i32 %i.09
1292 store float %conv, float* %arrayidx2, align 4
1293 %inc = add nuw i32 %i.09, 1
1294 %exitcond = icmp eq i32 %inc, %N
1295 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; half_short_mul: c[i] = (float)(a[i] * (half)b[i]) for i in [0, N), with half
; input %a, signed i16 input %b (converted with sitofp) and float output %c.
; The IR is already vectorized: vector.body handles the first (N & -4)
; elements four at a time, for.body handles the remainder one at a time.
; Expected output: %b is loaded with vldrh.u32, its four lanes are moved out to
; core registers and re-packed as 16-bit lanes with vmov.16 before
; vcvt.f16.s16; %a is fetched as two 32-bit ldr loads inserted with vmov.32;
; the product (vmul.f16) is widened with four scalar vcvtt/vcvtb.f32.f16
; converts and stored with vstrb.8. The rounded-down count in r7 is spilled to
; the stack before the vector loop and reloaded in middle.block.
; NOTE(review): in this copy no `entry:` label, no terminator for
; %for.body.preheader13, no instruction in %for.cond.cleanup and no closing
; `}` are visible. These lines look dropped -- confirm against the upstream test.
1298 define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* nocapture readonly %b, float* nocapture %c, i32 %N) {
1299 ; CHECK-LABEL: half_short_mul:
1300 ; CHECK: @ %bb.0: @ %entry
1301 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1302 ; CHECK-NEXT: sub sp, #4
1303 ; CHECK-NEXT: cmp r3, #0
1304 ; CHECK-NEXT: beq .LBB8_8
1305 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1306 ; CHECK-NEXT: cmp r3, #3
1307 ; CHECK-NEXT: bhi .LBB8_3
1308 ; CHECK-NEXT: @ %bb.2:
1309 ; CHECK-NEXT: movs r7, #0
1310 ; CHECK-NEXT: b .LBB8_6
1311 ; CHECK-NEXT: .LBB8_3: @ %vector.ph
1312 ; CHECK-NEXT: bic r7, r3, #3
1313 ; CHECK-NEXT: str r7, [sp] @ 4-byte Spill
1314 ; CHECK-NEXT: subs r6, r7, #4
1315 ; CHECK-NEXT: movs r5, #1
1316 ; CHECK-NEXT: mov r4, r0
1317 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
1318 ; CHECK-NEXT: mov r5, r1
1319 ; CHECK-NEXT: mov r6, r2
1320 ; CHECK-NEXT: .LBB8_4: @ %vector.body
1321 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1322 ; CHECK-NEXT: vldrh.u32 q0, [r5], #8
1323 ; CHECK-NEXT: ldr.w r9, [r4]
1324 ; CHECK-NEXT: ldr.w r10, [r4, #4]
1325 ; CHECK-NEXT: adds r4, #8
1326 ; CHECK-NEXT: vmov r7, r12, d0
1327 ; CHECK-NEXT: vmov.32 q1[0], r9
1328 ; CHECK-NEXT: vmov r11, r8, d1
1329 ; CHECK-NEXT: vmov.16 q0[0], r7
1330 ; CHECK-NEXT: vmov.16 q0[1], r12
1331 ; CHECK-NEXT: vmov.32 q1[1], r10
1332 ; CHECK-NEXT: vmov.16 q0[2], r11
1333 ; CHECK-NEXT: vmov.16 q0[3], r8
1334 ; CHECK-NEXT: vcvt.f16.s16 q0, q0
1335 ; CHECK-NEXT: vmul.f16 q0, q1, q0
1336 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
1337 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
1338 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
1339 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1340 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
1341 ; CHECK-NEXT: le lr, .LBB8_4
1342 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1343 ; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload
1344 ; CHECK-NEXT: cmp r7, r3
1345 ; CHECK-NEXT: beq .LBB8_8
1346 ; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13
1347 ; CHECK-NEXT: sub.w lr, r3, r7
1348 ; CHECK-NEXT: add.w r0, r0, r7, lsl #1
1349 ; CHECK-NEXT: add.w r1, r1, r7, lsl #1
1350 ; CHECK-NEXT: add.w r2, r2, r7, lsl #2
1351 ; CHECK-NEXT: .LBB8_7: @ %for.body
1352 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1353 ; CHECK-NEXT: ldrsh r3, [r1], #2
1354 ; CHECK-NEXT: vldr.16 s0, [r0]
1355 ; CHECK-NEXT: adds r0, #2
1356 ; CHECK-NEXT: vmov s2, r3
1357 ; CHECK-NEXT: vcvt.f16.s32 s2, s2
1358 ; CHECK-NEXT: vmul.f16 s0, s0, s2
1359 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
1360 ; CHECK-NEXT: vstmia r2!, {s0}
1361 ; CHECK-NEXT: le lr, .LBB8_7
1362 ; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup
1363 ; CHECK-NEXT: add sp, #4
1364 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; Nothing to do when N == 0.
1366 %cmp10 = icmp eq i32 %N, 0
1367 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
1369 for.body.preheader: ; preds = %entry
; Fewer than four elements: go straight to the scalar loop.
1370 %min.iters.check = icmp ult i32 %N, 4
1371 br i1 %min.iters.check, label %for.body.preheader13, label %vector.ph
; Scalar loop start index: 0 when the vector loop was skipped, %n.vec after it.
1373 for.body.preheader13: ; preds = %middle.block, %for.body.preheader
1374 %i.011.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1377 vector.ph: ; preds = %for.body.preheader
; %n.vec = N rounded down to a multiple of 4 (elements covered by vector.body).
1378 %n.vec = and i32 %N, -4
1379 br label %vector.body
; Main vector loop: sitofp <4 x i16> to <4 x half>, multiply by the <4 x half>
; from %a, then fpext to <4 x float>. Both input loads are only 2-byte aligned.
1381 vector.body: ; preds = %vector.body, %vector.ph
1382 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1383 %0 = getelementptr inbounds half, half* %a, i32 %index
1384 %1 = bitcast half* %0 to <4 x half>*
1385 %wide.load = load <4 x half>, <4 x half>* %1, align 2
1386 %2 = getelementptr inbounds i16, i16* %b, i32 %index
1387 %3 = bitcast i16* %2 to <4 x i16>*
1388 %wide.load12 = load <4 x i16>, <4 x i16>* %3, align 2
1389 %4 = sitofp <4 x i16> %wide.load12 to <4 x half>
1390 %5 = fmul <4 x half> %wide.load, %4
1391 %6 = fpext <4 x half> %5 to <4 x float>
1392 %7 = getelementptr inbounds float, float* %c, i32 %index
1393 %8 = bitcast float* %7 to <4 x float>*
1394 store <4 x float> %6, <4 x float>* %8, align 4
1395 %index.next = add i32 %index, 4
1396 %9 = icmp eq i32 %index.next, %n.vec
1397 br i1 %9, label %middle.block, label %vector.body
1399 middle.block: ; preds = %vector.body
; Skip the scalar remainder when the vector loop already covered all N elements.
1400 %cmp.n = icmp eq i32 %n.vec, %N
1401 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
1403 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar remainder loop: one element per iteration until %inc == N.
1406 for.body: ; preds = %for.body.preheader13, %for.body
1407 %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader13 ]
1408 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.011
1409 %10 = load half, half* %arrayidx, align 2
1410 %arrayidx1 = getelementptr inbounds i16, i16* %b, i32 %i.011
1411 %11 = load i16, i16* %arrayidx1, align 2
1412 %conv2 = sitofp i16 %11 to half
1413 %mul = fmul half %10, %conv2
1414 %conv3 = fpext half %mul to float
1415 %arrayidx4 = getelementptr inbounds float, float* %c, i32 %i.011
1416 store float %conv3, float* %arrayidx4, align 4
1417 %inc = add nuw i32 %i.011, 1
1418 %exitcond = icmp eq i32 %inc, %N
1419 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1422 define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) {
1423 ; CHECK-LABEL: half_half_mac:
1424 ; CHECK: @ %bb.0: @ %entry
1425 ; CHECK-NEXT: push {r4, r5, r7, lr}
1426 ; CHECK-NEXT: cbz r2, .LBB9_3
1427 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1428 ; CHECK-NEXT: subs r3, r2, #1
1429 ; CHECK-NEXT: and r12, r2, #3
1430 ; CHECK-NEXT: vldr s0, .LCPI9_0
1431 ; CHECK-NEXT: cmp r3, #3
1432 ; CHECK-NEXT: bhs .LBB9_4
1433 ; CHECK-NEXT: @ %bb.2:
1434 ; CHECK-NEXT: movs r2, #0
1435 ; CHECK-NEXT: b .LBB9_6
1436 ; CHECK-NEXT: .LBB9_3:
1437 ; CHECK-NEXT: vldr s0, .LCPI9_0
1438 ; CHECK-NEXT: b .LBB9_9
1439 ; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
1440 ; CHECK-NEXT: sub.w lr, r2, r12
1441 ; CHECK-NEXT: movs r3, #0
1442 ; CHECK-NEXT: movs r2, #0
1443 ; CHECK-NEXT: .LBB9_5: @ %for.body
1444 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1445 ; CHECK-NEXT: adds r5, r0, r3
1446 ; CHECK-NEXT: adds r4, r1, r3
1447 ; CHECK-NEXT: vldr.16 s2, [r4, #6]
1448 ; CHECK-NEXT: vldr.16 s4, [r5, #6]
1449 ; CHECK-NEXT: vldr.16 s6, [r5, #4]
1450 ; CHECK-NEXT: vldr.16 s8, [r5, #2]
1451 ; CHECK-NEXT: vmul.f16 s2, s4, s2
1452 ; CHECK-NEXT: vldr.16 s4, [r4, #4]
1453 ; CHECK-NEXT: vldr.16 s10, [r5]
1454 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1455 ; CHECK-NEXT: vmul.f16 s4, s6, s4
1456 ; CHECK-NEXT: vldr.16 s6, [r4, #2]
1457 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1458 ; CHECK-NEXT: adds r2, #4
1459 ; CHECK-NEXT: vmul.f16 s6, s8, s6
1460 ; CHECK-NEXT: vldr.16 s8, [r4]
1461 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1462 ; CHECK-NEXT: adds r3, #8
1463 ; CHECK-NEXT: vmul.f16 s8, s10, s8
1464 ; CHECK-NEXT: cmp lr, r2
1465 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1466 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1467 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1468 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1469 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1470 ; CHECK-NEXT: bne .LBB9_5
1471 ; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1472 ; CHECK-NEXT: wls lr, r12, .LBB9_9
1473 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1474 ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1475 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1476 ; CHECK-NEXT: .LBB9_8: @ %for.body.epil
1477 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1478 ; CHECK-NEXT: vldr.16 s2, [r1]
1479 ; CHECK-NEXT: vldr.16 s4, [r0]
1480 ; CHECK-NEXT: adds r0, #2
1481 ; CHECK-NEXT: adds r1, #2
1482 ; CHECK-NEXT: vmul.f16 s2, s4, s2
1483 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1484 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1485 ; CHECK-NEXT: le lr, .LBB9_8
1486 ; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
1487 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1488 ; CHECK-NEXT: .p2align 2
1489 ; CHECK-NEXT: @ %bb.10:
1490 ; CHECK-NEXT: .LCPI9_0:
1491 ; CHECK-NEXT: .long 0x00000000 @ float 0
1493 %cmp8 = icmp eq i32 %N, 0
1494 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1496 for.body.preheader: ; preds = %entry
1498 %xtraiter = and i32 %N, 3
1499 %1 = icmp ult i32 %0, 3
1500 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1502 for.body.preheader.new: ; preds = %for.body.preheader
1503 %unroll_iter = sub i32 %N, %xtraiter
1506 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1507 %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
1508 %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1509 %res.09.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
1510 %lcmp.mod = icmp eq i32 %xtraiter, 0
1511 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1513 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1514 %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1515 %res.09.epil = phi float [ %add.epil, %for.body.epil ], [ %res.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1516 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1517 %arrayidx.epil = getelementptr inbounds half, half* %a, i32 %i.010.epil
1518 %2 = load half, half* %arrayidx.epil, align 2
1519 %arrayidx1.epil = getelementptr inbounds half, half* %b, i32 %i.010.epil
1520 %3 = load half, half* %arrayidx1.epil, align 2
1521 %mul.epil = fmul half %2, %3
1522 %conv.epil = fpext half %mul.epil to float
1523 %add.epil = fadd float %res.09.epil, %conv.epil
1524 %inc.epil = add nuw i32 %i.010.epil, 1
1525 %epil.iter.sub = add i32 %epil.iter, -1
1526 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1527 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1529 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1530 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
1531 ret float %res.0.lcssa
1533 for.body: ; preds = %for.body, %for.body.preheader.new
1534 %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1535 %res.09 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
1536 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1537 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.010
1538 %4 = load half, half* %arrayidx, align 2
1539 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.010
1540 %5 = load half, half* %arrayidx1, align 2
1541 %mul = fmul half %4, %5
1542 %conv = fpext half %mul to float
1543 %add = fadd float %res.09, %conv
1544 %inc = or i32 %i.010, 1
1545 %arrayidx.1 = getelementptr inbounds half, half* %a, i32 %inc
1546 %6 = load half, half* %arrayidx.1, align 2
1547 %arrayidx1.1 = getelementptr inbounds half, half* %b, i32 %inc
1548 %7 = load half, half* %arrayidx1.1, align 2
1549 %mul.1 = fmul half %6, %7
1550 %conv.1 = fpext half %mul.1 to float
1551 %add.1 = fadd float %add, %conv.1
1552 %inc.1 = or i32 %i.010, 2
1553 %arrayidx.2 = getelementptr inbounds half, half* %a, i32 %inc.1
1554 %8 = load half, half* %arrayidx.2, align 2
1555 %arrayidx1.2 = getelementptr inbounds half, half* %b, i32 %inc.1
1556 %9 = load half, half* %arrayidx1.2, align 2
1557 %mul.2 = fmul half %8, %9
1558 %conv.2 = fpext half %mul.2 to float
1559 %add.2 = fadd float %add.1, %conv.2
1560 %inc.2 = or i32 %i.010, 3
1561 %arrayidx.3 = getelementptr inbounds half, half* %a, i32 %inc.2
1562 %10 = load half, half* %arrayidx.3, align 2
1563 %arrayidx1.3 = getelementptr inbounds half, half* %b, i32 %inc.2
1564 %11 = load half, half* %arrayidx1.3, align 2
1565 %mul.3 = fmul half %10, %11
1566 %conv.3 = fpext half %mul.3 to float
1567 %add.3 = fadd float %add.2, %conv.3
1568 %inc.3 = add nuw i32 %i.010, 4
1569 %niter.nsub.3 = add i32 %niter, -4
1570 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1571 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; half_half_acc: llc codegen test for a scalar add-and-accumulate loop over two
; half arrays. Each element pair a[i], b[i] is loaded as half, added in half,
; extended to float and added to a float accumulator; the accumulator is the
; return value, and 0.0 is returned when N == 0.
; The IR is already unrolled by four: %for.body consumes N - (N & 3) elements
; and %for.body.epil runs the remaining N & 3 iterations.
; The FileCheck-prefixed lines are autogenerated (see the note on the first
; line of this file); regenerate them rather than editing them by hand.
1574 define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) {
1575 ; CHECK-LABEL: half_half_acc:
1576 ; CHECK: @ %bb.0: @ %entry
1577 ; CHECK-NEXT: push {r4, r5, r7, lr}
1578 ; CHECK-NEXT: cbz r2, .LBB10_3
1579 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1580 ; CHECK-NEXT: subs r3, r2, #1
1581 ; CHECK-NEXT: and r12, r2, #3
1582 ; CHECK-NEXT: vldr s0, .LCPI10_0
1583 ; CHECK-NEXT: cmp r3, #3
1584 ; CHECK-NEXT: bhs .LBB10_4
1585 ; CHECK-NEXT: @ %bb.2:
1586 ; CHECK-NEXT: movs r2, #0
1587 ; CHECK-NEXT: b .LBB10_6
1588 ; CHECK-NEXT: .LBB10_3:
1589 ; CHECK-NEXT: vldr s0, .LCPI10_0
1590 ; CHECK-NEXT: b .LBB10_9
1591 ; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
1592 ; CHECK-NEXT: sub.w lr, r2, r12
1593 ; CHECK-NEXT: movs r3, #0
1594 ; CHECK-NEXT: movs r2, #0
1595 ; CHECK-NEXT: .LBB10_5: @ %for.body
1596 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1597 ; CHECK-NEXT: adds r5, r0, r3
1598 ; CHECK-NEXT: adds r4, r1, r3
1599 ; CHECK-NEXT: vldr.16 s2, [r4, #6]
1600 ; CHECK-NEXT: vldr.16 s4, [r5, #6]
1601 ; CHECK-NEXT: vldr.16 s6, [r5, #4]
1602 ; CHECK-NEXT: vldr.16 s8, [r5, #2]
1603 ; CHECK-NEXT: vadd.f16 s2, s4, s2
1604 ; CHECK-NEXT: vldr.16 s4, [r4, #4]
1605 ; CHECK-NEXT: vldr.16 s10, [r5]
1606 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1607 ; CHECK-NEXT: vadd.f16 s4, s6, s4
1608 ; CHECK-NEXT: vldr.16 s6, [r4, #2]
1609 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1610 ; CHECK-NEXT: adds r2, #4
1611 ; CHECK-NEXT: vadd.f16 s6, s8, s6
1612 ; CHECK-NEXT: vldr.16 s8, [r4]
1613 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1614 ; CHECK-NEXT: adds r3, #8
1615 ; CHECK-NEXT: vadd.f16 s8, s10, s8
1616 ; CHECK-NEXT: cmp lr, r2
1617 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1618 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1619 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1620 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1621 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1622 ; CHECK-NEXT: bne .LBB10_5
1623 ; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1624 ; CHECK-NEXT: wls lr, r12, .LBB10_9
1625 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1626 ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1627 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1628 ; CHECK-NEXT: .LBB10_8: @ %for.body.epil
1629 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1630 ; CHECK-NEXT: vldr.16 s2, [r1]
1631 ; CHECK-NEXT: vldr.16 s4, [r0]
1632 ; CHECK-NEXT: adds r0, #2
1633 ; CHECK-NEXT: adds r1, #2
1634 ; CHECK-NEXT: vadd.f16 s2, s4, s2
1635 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1636 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1637 ; CHECK-NEXT: le lr, .LBB10_8
1638 ; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
1639 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1640 ; CHECK-NEXT: .p2align 2
1641 ; CHECK-NEXT: @ %bb.10:
1642 ; CHECK-NEXT: .LCPI10_0:
1643 ; CHECK-NEXT: .long 0x00000000 @ float 0
1645 %cmp9 = icmp eq i32 %N, 0 ; N == 0: nothing to accumulate, return 0.0
1646 br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
1648 for.body.preheader: ; preds = %entry
1650 %xtraiter = and i32 %N, 3 ; iterations left over after unrolling by four
1651 %1 = icmp ult i32 %0, 3 ; unsigned test: true selects the epilogue-only path
1652 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1654 for.body.preheader.new: ; preds = %for.body.preheader
1655 %unroll_iter = sub i32 %N, %xtraiter ; N rounded down to a multiple of four
1658 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1659 %add2.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add2.3, %for.body ] ; returned when the epilogue is skipped; undef on the preheader edge
1660 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] ; index at which the epilogue starts
1661 %res.010.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add2.3, %for.body ] ; accumulator handed to the epilogue
1662 %lcmp.mod = icmp eq i32 %xtraiter, 0 ; true when there are no leftover iterations
1663 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1665 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1666 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1667 %res.010.epil = phi float [ %add2.epil, %for.body.epil ], [ %res.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1668 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] ; counts down from %xtraiter to zero
1669 %arrayidx.epil = getelementptr inbounds half, half* %a, i32 %i.011.epil
1670 %2 = load half, half* %arrayidx.epil, align 2
1671 %arrayidx1.epil = getelementptr inbounds half, half* %b, i32 %i.011.epil
1672 %3 = load half, half* %arrayidx1.epil, align 2
1673 %add.epil = fadd half %2, %3 ; a[i] + b[i], computed in half
1674 %conv.epil = fpext half %add.epil to float ; widen the half sum to float
1675 %add2.epil = fadd float %res.010.epil, %conv.epil ; accumulate in float
1676 %inc.epil = add nuw i32 %i.011.epil, 1
1677 %epil.iter.sub = add i32 %epil.iter, -1
1678 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1679 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1681 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1682 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add2.epil, %for.body.epil ] ; 0.0 from %entry, otherwise the accumulated sum
1683 ret float %res.0.lcssa
1685 for.body: ; preds = %for.body, %for.body.preheader.new
1686 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] ; element index; starts at 0 and advances by four
1687 %res.010 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add2.3, %for.body ] ; float accumulator
1688 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] ; elements still to process here; counts down by four
1689 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.011
1690 %4 = load half, half* %arrayidx, align 2
1691 %arrayidx1 = getelementptr inbounds half, half* %b, i32 %i.011
1692 %5 = load half, half* %arrayidx1, align 2
1693 %add = fadd half %4, %5 ; element i: half add, then widen and accumulate
1694 %conv = fpext half %add to float
1695 %add2 = fadd float %res.010, %conv
1696 %inc = or i32 %i.011, 1 ; i + 1: i is a multiple of four, so `or` acts as an add
1697 %arrayidx.1 = getelementptr inbounds half, half* %a, i32 %inc
1698 %6 = load half, half* %arrayidx.1, align 2
1699 %arrayidx1.1 = getelementptr inbounds half, half* %b, i32 %inc
1700 %7 = load half, half* %arrayidx1.1, align 2
1701 %add.1 = fadd half %6, %7 ; element i + 1
1702 %conv.1 = fpext half %add.1 to float
1703 %add2.1 = fadd float %add2, %conv.1
1704 %inc.1 = or i32 %i.011, 2 ; i + 2
1705 %arrayidx.2 = getelementptr inbounds half, half* %a, i32 %inc.1
1706 %8 = load half, half* %arrayidx.2, align 2
1707 %arrayidx1.2 = getelementptr inbounds half, half* %b, i32 %inc.1
1708 %9 = load half, half* %arrayidx1.2, align 2
1709 %add.2 = fadd half %8, %9 ; element i + 2
1710 %conv.2 = fpext half %add.2 to float
1711 %add2.2 = fadd float %add2.1, %conv.2
1712 %inc.2 = or i32 %i.011, 3 ; i + 3
1713 %arrayidx.3 = getelementptr inbounds half, half* %a, i32 %inc.2
1714 %10 = load half, half* %arrayidx.3, align 2
1715 %arrayidx1.3 = getelementptr inbounds half, half* %b, i32 %inc.2
1716 %11 = load half, half* %arrayidx1.3, align 2
1717 %add.3 = fadd half %10, %11 ; element i + 3
1718 %conv.3 = fpext half %add.3 to float
1719 %add2.3 = fadd float %add2.2, %conv.3 ; the four float adds are chained in element order
1720 %inc.3 = add nuw i32 %i.011, 4
1721 %niter.nsub.3 = add i32 %niter, -4
1722 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 ; leave the unrolled loop once the counter reaches zero
1723 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; half_short_mac: llc codegen test for a scalar multiply-accumulate loop over a
; half array and an i16 array. Each b[i] is loaded as i16 and converted to half
; with a signed conversion, multiplied by a[i] in half, extended to float and
; added to a float accumulator; the accumulator is the return value, and 0.0
; is returned when N == 0.
; The IR is already unrolled by four: %for.body consumes N - (N & 3) elements
; and %for.body.epil runs the remaining N & 3 iterations.
; The FileCheck-prefixed lines are autogenerated (see the note on the first
; line of this file); regenerate them rather than editing them by hand.
1726 define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) {
1727 ; CHECK-LABEL: half_short_mac:
1728 ; CHECK: @ %bb.0: @ %entry
1729 ; CHECK-NEXT: push {r4, r5, r6, lr}
1730 ; CHECK-NEXT: cbz r2, .LBB11_3
1731 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1732 ; CHECK-NEXT: subs r3, r2, #1
1733 ; CHECK-NEXT: and r12, r2, #3
1734 ; CHECK-NEXT: vldr s0, .LCPI11_0
1735 ; CHECK-NEXT: cmp r3, #3
1736 ; CHECK-NEXT: bhs .LBB11_4
1737 ; CHECK-NEXT: @ %bb.2:
1738 ; CHECK-NEXT: movs r2, #0
1739 ; CHECK-NEXT: b .LBB11_6
1740 ; CHECK-NEXT: .LBB11_3:
1741 ; CHECK-NEXT: vldr s0, .LCPI11_0
1742 ; CHECK-NEXT: b .LBB11_9
1743 ; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
1744 ; CHECK-NEXT: sub.w lr, r2, r12
1745 ; CHECK-NEXT: adds r3, r1, #4
1746 ; CHECK-NEXT: adds r4, r0, #4
1747 ; CHECK-NEXT: movs r2, #0
1748 ; CHECK-NEXT: .LBB11_5: @ %for.body
1749 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1750 ; CHECK-NEXT: ldrsh.w r5, [r3, #2]
1751 ; CHECK-NEXT: vldr.16 s2, [r4, #2]
1752 ; CHECK-NEXT: adds r2, #4
1753 ; CHECK-NEXT: cmp lr, r2
1754 ; CHECK-NEXT: vmov s4, r5
1755 ; CHECK-NEXT: ldrsh r5, [r3], #8
1756 ; CHECK-NEXT: vcvt.f16.s32 s4, s4
1757 ; CHECK-NEXT: ldrsh r6, [r3, #-10]
1758 ; CHECK-NEXT: vmul.f16 s2, s2, s4
1759 ; CHECK-NEXT: vmov s6, r5
1760 ; CHECK-NEXT: vldr.16 s4, [r4]
1761 ; CHECK-NEXT: vcvt.f16.s32 s6, s6
1762 ; CHECK-NEXT: ldrsh r5, [r3, #-12]
1763 ; CHECK-NEXT: vmul.f16 s4, s4, s6
1764 ; CHECK-NEXT: vmov s8, r6
1765 ; CHECK-NEXT: vldr.16 s6, [r4, #-2]
1766 ; CHECK-NEXT: vcvt.f16.s32 s8, s8
1767 ; CHECK-NEXT: vmov s10, r5
1768 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1769 ; CHECK-NEXT: vmul.f16 s6, s6, s8
1770 ; CHECK-NEXT: vldr.16 s8, [r4, #-4]
1771 ; CHECK-NEXT: vcvt.f16.s32 s10, s10
1772 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1773 ; CHECK-NEXT: vmul.f16 s8, s8, s10
1774 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1775 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1776 ; CHECK-NEXT: add.w r4, r4, #8
1777 ; CHECK-NEXT: vadd.f32 s0, s0, s8
1778 ; CHECK-NEXT: vadd.f32 s0, s0, s6
1779 ; CHECK-NEXT: vadd.f32 s0, s0, s4
1780 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1781 ; CHECK-NEXT: bne .LBB11_5
1782 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1783 ; CHECK-NEXT: wls lr, r12, .LBB11_9
1784 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1785 ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1786 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1787 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil
1788 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1789 ; CHECK-NEXT: ldrsh r2, [r1], #2
1790 ; CHECK-NEXT: vldr.16 s2, [r0]
1791 ; CHECK-NEXT: adds r0, #2
1792 ; CHECK-NEXT: vmov s4, r2
1793 ; CHECK-NEXT: vcvt.f16.s32 s4, s4
1794 ; CHECK-NEXT: vmul.f16 s2, s2, s4
1795 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1796 ; CHECK-NEXT: vadd.f32 s0, s0, s2
1797 ; CHECK-NEXT: le lr, .LBB11_8
1798 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
1799 ; CHECK-NEXT: pop {r4, r5, r6, pc}
1800 ; CHECK-NEXT: .p2align 2
1801 ; CHECK-NEXT: @ %bb.10:
1802 ; CHECK-NEXT: .LCPI11_0:
1803 ; CHECK-NEXT: .long 0x00000000 @ float 0
1805 %cmp10 = icmp eq i32 %N, 0 ; N == 0: nothing to accumulate, return 0.0
1806 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
1808 for.body.preheader: ; preds = %entry
1810 %xtraiter = and i32 %N, 3 ; iterations left over after unrolling by four
1811 %1 = icmp ult i32 %0, 3 ; unsigned test: true selects the epilogue-only path
1812 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1814 for.body.preheader.new: ; preds = %for.body.preheader
1815 %unroll_iter = sub i32 %N, %xtraiter ; N rounded down to a multiple of four
1818 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1819 %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ] ; returned when the epilogue is skipped; undef on the preheader edge
1820 %i.012.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] ; index at which the epilogue starts
1821 %res.011.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ] ; accumulator handed to the epilogue
1822 %lcmp.mod = icmp eq i32 %xtraiter, 0 ; true when there are no leftover iterations
1823 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1825 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1826 %i.012.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.012.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1827 %res.011.epil = phi float [ %add.epil, %for.body.epil ], [ %res.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1828 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] ; counts down from %xtraiter to zero
1829 %arrayidx.epil = getelementptr inbounds half, half* %a, i32 %i.012.epil
1830 %2 = load half, half* %arrayidx.epil, align 2
1831 %arrayidx1.epil = getelementptr inbounds i16, i16* %b, i32 %i.012.epil
1832 %3 = load i16, i16* %arrayidx1.epil, align 2
1833 %conv2.epil = sitofp i16 %3 to half ; signed i16 -> half
1834 %mul.epil = fmul half %2, %conv2.epil ; a[i] * (half)b[i], computed in half
1835 %conv3.epil = fpext half %mul.epil to float ; widen the half product to float
1836 %add.epil = fadd float %res.011.epil, %conv3.epil ; accumulate in float
1837 %inc.epil = add nuw i32 %i.012.epil, 1
1838 %epil.iter.sub = add i32 %epil.iter, -1
1839 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1840 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1842 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1843 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ] ; 0.0 from %entry, otherwise the accumulated sum
1844 ret float %res.0.lcssa
1846 for.body: ; preds = %for.body, %for.body.preheader.new
1847 %i.012 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] ; element index; starts at 0 and advances by four
1848 %res.011 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ] ; float accumulator
1849 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] ; elements still to process here; counts down by four
1850 %arrayidx = getelementptr inbounds half, half* %a, i32 %i.012
1851 %4 = load half, half* %arrayidx, align 2
1852 %arrayidx1 = getelementptr inbounds i16, i16* %b, i32 %i.012
1853 %5 = load i16, i16* %arrayidx1, align 2
1854 %conv2 = sitofp i16 %5 to half ; element i: convert, half multiply, widen, accumulate
1855 %mul = fmul half %4, %conv2
1856 %conv3 = fpext half %mul to float
1857 %add = fadd float %res.011, %conv3
1858 %inc = or i32 %i.012, 1 ; i + 1: i is a multiple of four, so `or` acts as an add
1859 %arrayidx.1 = getelementptr inbounds half, half* %a, i32 %inc
1860 %6 = load half, half* %arrayidx.1, align 2
1861 %arrayidx1.1 = getelementptr inbounds i16, i16* %b, i32 %inc
1862 %7 = load i16, i16* %arrayidx1.1, align 2
1863 %conv2.1 = sitofp i16 %7 to half ; element i + 1
1864 %mul.1 = fmul half %6, %conv2.1
1865 %conv3.1 = fpext half %mul.1 to float
1866 %add.1 = fadd float %add, %conv3.1
1867 %inc.1 = or i32 %i.012, 2 ; i + 2
1868 %arrayidx.2 = getelementptr inbounds half, half* %a, i32 %inc.1
1869 %8 = load half, half* %arrayidx.2, align 2
1870 %arrayidx1.2 = getelementptr inbounds i16, i16* %b, i32 %inc.1
1871 %9 = load i16, i16* %arrayidx1.2, align 2
1872 %conv2.2 = sitofp i16 %9 to half ; element i + 2
1873 %mul.2 = fmul half %8, %conv2.2
1874 %conv3.2 = fpext half %mul.2 to float
1875 %add.2 = fadd float %add.1, %conv3.2
1876 %inc.2 = or i32 %i.012, 3 ; i + 3
1877 %arrayidx.3 = getelementptr inbounds half, half* %a, i32 %inc.2
1878 %10 = load half, half* %arrayidx.3, align 2
1879 %arrayidx1.3 = getelementptr inbounds i16, i16* %b, i32 %inc.2
1880 %11 = load i16, i16* %arrayidx1.3, align 2
1881 %conv2.3 = sitofp i16 %11 to half ; element i + 3
1882 %mul.3 = fmul half %10, %conv2.3
1883 %conv3.3 = fpext half %mul.3 to float
1884 %add.3 = fadd float %add.2, %conv3.3 ; the four float adds are chained in element order
1885 %inc.3 = add nuw i32 %i.012, 4
1886 %niter.nsub.3 = add i32 %niter, -4
1887 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 ; leave the unrolled loop once the counter reaches zero
1888 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body