; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled %s -o - | FileCheck %s
4 define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r4, lr}
8 ; CHECK-NEXT: push {r4, lr}
9 ; CHECK-NEXT: cmp r3, #1
11 ; CHECK-NEXT: poplt {r4, pc}
12 ; CHECK-NEXT: .LBB0_1: @ %vector.ph
13 ; CHECK-NEXT: vmov r12, s0
14 ; CHECK-NEXT: dlstp.32 lr, r3
15 ; CHECK-NEXT: .LBB0_2: @ %vector.body
16 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
17 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
18 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
19 ; CHECK-NEXT: vfmas.f32 q1, q0, r12
20 ; CHECK-NEXT: vstrw.32 q1, [r2], #16
21 ; CHECK-NEXT: letp lr, .LBB0_2
22 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
23 ; CHECK-NEXT: pop {r4, pc}
25 %cmp8 = icmp sgt i32 %n, 0
26 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
28 vector.ph: ; preds = %entry
29 %n.rnd.up = add i32 %n, 3
30 %n.vec = and i32 %n.rnd.up, -4
31 %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
32 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
35 vector.body: ; preds = %vector.body, %vector.ph
36 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
37 %0 = getelementptr inbounds float, float* %x, i32 %index
38 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
39 %2 = bitcast float* %0 to <4 x float>*
40 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
41 %3 = getelementptr inbounds float, float* %y, i32 %index
42 %4 = bitcast float* %3 to <4 x float>*
43 %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
44 %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
45 %6 = getelementptr inbounds float, float* %z, i32 %index
46 %7 = bitcast float* %6 to <4 x float>*
47 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
48 %index.next = add i32 %index, 4
49 %8 = icmp eq i32 %index.next, %n.vec
50 br i1 %8, label %for.cond.cleanup, label %vector.body
52 for.cond.cleanup: ; preds = %vector.body, %entry
; Same as fmas1 but expressed as separate fmul+fadd (fast-math); should still
; be combined into the vfmas.f32 scalar-accumulate form.
define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB1_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %6 = fadd fast <4 x float> %5, %broadcast.splat14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*a + y[i]: splat is a multiplicand of llvm.fma, so the expected
; selection is the vector-accumulating vfma.f32 q, q, r form.
define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Same as fma1 but expressed as fmul-by-splat followed by fadd; should still
; fold to vfma.f32 with the scalar in a GPR.
define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
  %6 = fadd fast <4 x float> %3, %wide.masked.load14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*y[i] - a: the fneg of the scalar is folded by flipping the sign
; bit in the GPR (eor with 0x80000000) and reusing vfmas.f32.
define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB4_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: eor r12, r12, #-2147483648
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vfmas.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %fneg = fneg fast float %a
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*y[i] - a written as fmul then fsub of the splat; currently
; negates the splat vector in the preheader and uses the q-form vfma.f32.
define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB5_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: vdup.32 q0, r12
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfma.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %6 = fsub fast <4 x float> %5, %broadcast.splat14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = a - x[i]*y[i] via fma(x, -y, splat(a)); the negated multiplicand maps
; to the fused multiply-subtract vfms.f32 (q-form).
define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB6_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fneg fast <4 x float> %wide.masked.load12
  %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %5, <4 x float> %broadcast.splat14)
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = a - x[i]*y[i] written as fsub(splat, fmul); expected to match the
; same vfms.f32 pattern as fmss3.
define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB7_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB7_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB7_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %6 = fsub fast <4 x float> %broadcast.splat14, %5
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*(-a) + y[i]: the scalar negation folds to a sign-bit eor on the
; GPR operand of vfma.f32.
define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB8_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: eor r12, r12, #-2147483648
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrw.32 q1, [r2], #16
; CHECK-NEXT: letp lr, .LBB8_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %fneg = fneg fast float %a
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = y[i] - x[i]*a: fsub of an fmul-by-splat selects the q-form vfms.f32
; with the splat materialised by vdup.
define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB9_1: @ %vector.ph
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB9_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vfms.f32 q2, q1, q0
; CHECK-NEXT: vstrw.32 q2, [r2], #16
; CHECK-NEXT: letp lr, .LBB9_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat14
  %6 = fsub fast <4 x float> %wide.masked.load12, %5
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*a - y[i] via fma(x, splat(a), -y): the vector fneg stays as a
; vneg.f32 feeding vfma.f32 with the scalar in a GPR.
define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB10_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB10_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: vfma.f32 q0, q1, r12
; CHECK-NEXT: vstrw.32 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fneg fast <4 x float> %wide.masked.load12
  %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %5)
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; z[i] = x[i]*a - y[i] written as fsub(fmul, load); codegen matches fms3:
; vneg of the subtrahend followed by scalar vfma.f32.
define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB11_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB11_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: vfma.f32 q0, q1, r12
; CHECK-NEXT: vstrw.32 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB11_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
  %6 = fsub fast <4 x float> %3, %wide.masked.load14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Intrinsics used by the tests above: masked load/store for the predicated
; loop bodies, llvm.fma for the fused operations, and get.active.lane.mask
; for tail predication.
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)