; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled %s -o - | FileCheck %s
; fmas1: z[i] = fma(x[i], y[i], splat(a)), i.e. x[i] * y[i] + a, for n floats.
; The multiply-add is a single llvm.fma call whose addend is the loop-invariant
; broadcast of %a. Both loads and the store are predicated by
; llvm.get.active.lane.mask; the assertions below expect a tail-predicated
; loop (dlstp.32 / letp) around a vfmas.f32 that takes the scalar in r12.
define arm_aapcs_vfpcc void @fmas1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * y + splat(a)
  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
  %4 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fmas2: z[i] = y[i] * x[i] + a for n floats, written as a separate fast-math
; fmul and fadd (rather than an llvm.fma call) with the loop-invariant
; broadcast of %a as the addend. The assertions below expect the pair to be
; fused into a vfmas.f32 with the scalar in r12 inside a tail-predicated loop.
define arm_aapcs_vfpcc void @fmas2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; (y * x) + splat(a), as two separate fast-math operations.
  %3 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %4 = fadd fast <4 x float> %3, %broadcast.splat14
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fma1: z[i] = fma(x[i], splat(a), y[i]), i.e. x[i] * a + y[i], for n floats.
; Here the broadcast of %a is a multiplicand and the loaded y vector is the
; addend. The assertions below expect a vfma.f32 with the scalar in r12
; accumulating into the register loaded from y, inside a tail-predicated loop.
define arm_aapcs_vfpcc void @fma1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(a) + y
  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %4 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fma2: z[i] = x[i] * a + y[i] for n floats, written as a separate fast-math
; fmul (by the broadcast of %a) and fadd (with the loaded y vector) instead of
; an llvm.fma call. The assertions below expect the pair to be fused into a
; vfma.f32 with the scalar in r12 inside a tail-predicated loop.
define arm_aapcs_vfpcc void @fma2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(a), computed before y is loaded.
  %2 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
  %3 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %1, <4 x float> undef)
  ; (x * splat(a)) + y
  %4 = fadd fast <4 x float> %2, %wide.masked.load14
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fmss1: z[i] = fma(x[i], y[i], splat(-a)), i.e. x[i] * y[i] - a, for n floats.
; The scalar %a is negated once in the preheader and then broadcast. The
; assertions below expect that negation to become an eor of the sign bit on
; r12 ahead of the loop, followed by a vfmas.f32 in a tail-predicated loop.
define arm_aapcs_vfpcc void @fmss1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    eor r12, r12, #-2147483648
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Negate the scalar once, outside the loop.
  %fneg = fneg fast float %a
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast -a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * y + splat(-a)
  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
  %4 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fmss2: z[i] = y[i] * x[i] - a for n floats, written as a fast-math fmul
; followed by an fsub of the broadcast of %a. The assertions below expect the
; broadcast to be materialised and negated as a vector (vdup.32 + vneg.f32)
; ahead of the loop, then copied and accumulated into with a vector-operand
; vfma.f32 on each iteration of a tail-predicated loop.
define arm_aapcs_vfpcc void @fmss2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    vdup.32 q0, r12
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfma.f32 q3, q2, q1
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; (y * x) - splat(a)
  %3 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %4 = fsub fast <4 x float> %3, %broadcast.splat14
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fmss3: z[i] = fma(x[i], -y[i], splat(a)), i.e. a - x[i] * y[i], for n floats.
; The negation is applied to the loaded y vector inside the loop. The
; assertions below expect no separate negate: the broadcast of %a is copied
; each iteration and a vector-operand vfms.f32 subtracts the product from it,
; inside a tail-predicated loop.
define arm_aapcs_vfpcc void @fmss3(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB6_1: @ %vector.ph
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vdup.32 q0, r4
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB6_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfms.f32 q3, q2, q1
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    letp lr, .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * (-y) + splat(a)
  %3 = fneg fast <4 x float> %wide.masked.load12
  %4 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %3, <4 x float> %broadcast.splat14)
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fmss4: z[i] = a - y[i] * x[i] for n floats, written as a fast-math fmul
; followed by an fsub whose left operand is the broadcast of %a. The
; assertions below expect the broadcast to be copied each iteration and a
; vector-operand vfms.f32 to subtract the product from it, inside a
; tail-predicated loop.
define arm_aapcs_vfpcc void @fmss4(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB7_1: @ %vector.ph
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vdup.32 q0, r4
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB7_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfms.f32 q3, q2, q1
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    letp lr, .LBB7_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; splat(a) - (y * x)
  %3 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %4 = fsub fast <4 x float> %broadcast.splat14, %3
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fms1: z[i] = fma(x[i], splat(-a), y[i]), i.e. y[i] - x[i] * a, for n floats.
; The scalar %a is negated once in the preheader and then broadcast as a
; multiplicand. The assertions below expect that negation to become an eor of
; the sign bit on r12 ahead of the loop, followed by a scalar-operand vfma.f32
; in a tail-predicated loop.
define arm_aapcs_vfpcc void @fms1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB8_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    eor r12, r12, #-2147483648
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB8_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB8_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Negate the scalar once, outside the loop.
  %fneg = fneg fast float %a
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast -a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(-a) + y
  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %4 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fms2: z[i] = y[i] - x[i] * a for n floats, written as a fast-math fmul by the
; broadcast of %a followed by an fsub from the loaded y vector. The assertions
; below expect the broadcast to be materialised with vdup.32 ahead of the loop
; and a vector-operand vfms.f32 accumulating into the register loaded from y,
; inside a tail-predicated loop.
define arm_aapcs_vfpcc void @fms2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB9_1: @ %vector.ph
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vdup.32 q0, r4
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB9_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfms.f32 q2, q1, q0
; CHECK-NEXT:    vstrw.32 q2, [r2], #16
; CHECK-NEXT:    letp lr, .LBB9_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; y - (x * splat(a))
  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat14
  %4 = fsub fast <4 x float> %wide.masked.load12, %3
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fms3: z[i] = fma(x[i], splat(a), -y[i]), i.e. x[i] * a - y[i], for n floats.
; The negation is applied to the loaded y vector, which is the fma addend.
; The assertions below expect an explicit vneg.f32 of that vector in the loop,
; followed by a scalar-operand vfma.f32 accumulating into it, inside a
; tail-predicated loop.
define arm_aapcs_vfpcc void @fms3(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB10_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB10_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, r12
; CHECK-NEXT:    vstrw.32 q0, [r2], #16
; CHECK-NEXT:    letp lr, .LBB10_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(a) + (-y)
  %3 = fneg fast <4 x float> %wide.masked.load12
  %4 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %3)
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; fms4: z[i] = x[i] * a - y[i] for n floats, written as a fast-math fmul by the
; broadcast of %a followed by an fsub of the loaded y vector. The assertions
; below expect an explicit vneg.f32 of the y vector in the loop, followed by a
; scalar-operand vfma.f32 accumulating into it, inside a tail-predicated loop.
define arm_aapcs_vfpcc void @fms4(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB11_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB11_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, r12
; CHECK-NEXT:    vstrw.32 q0, [r2], #16
; CHECK-NEXT:    letp lr, .LBB11_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast %a into all four lanes.
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; Lanes at or beyond %n are masked off for the loads and the store.
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(a), computed before y is loaded.
  %2 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
  %3 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %1, <4 x float> undef)
  ; (x * splat(a)) - y
  %4 = fsub fast <4 x float> %2, %wide.masked.load14
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Intrinsics called by the test functions in this file: predicated vector
; load/store, vector fused multiply-add, and the lane-mask generator that
; supplies the <4 x i1> predicate for each loop iteration.
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)