; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
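
; Tail-predicated MVE codegen for simple DCT kernels. A sketch of the loop
; these functions appear to come from (an assumption for readability, not
; taken from this file); DCT_mveN unrolls the outer loop by N, producing N
; dot-products per outer iteration:
;
;   void DCT_mve1(const DCT_InstanceTypeDef *S, const float *pIn, float *pOut) {
;     for (uint32_t k = 1; k < S->NumFilters - 1; k++) {   // outer loop
;       float sum = 0.0f;
;       for (uint32_t i = 0; i < S->NumInputs; i++)        // vectorized, tail-predicated
;         sum += S->pDCTCoefs[k * S->NumInputs + i] * pIn[i];
;       pOut[k] = sum;
;     }
;   }
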
%struct.DCT_InstanceTypeDef = type { float*, i32, i32 }
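
; DCT_mve1: one dot-product per outer iteration. A single vector accumulator
; (q0) suffices, so the inner loop becomes a dlstp/letp tail-predicated
; low-overhead loop with no spilling.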
define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: ldr r3, [r0, #4]
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: cmp.w r12, #2
; CHECK-NEXT: blo .LBB0_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r5, [r0, #8]
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r3, r3, r5, lsl #2
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsl.w r9, r5, #2
; CHECK-NEXT: .LBB0_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: mov r7, r3
; CHECK-NEXT: dlstp.32 lr, r5
; CHECK-NEXT: .LBB0_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q1, [r6], #16
; CHECK-NEXT: vldrw.u32 q2, [r7], #16
; CHECK-NEXT: vfma.f32 q0, q2, q1
; CHECK-NEXT: letp lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: add.w r7, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: adds r0, #1
; CHECK-NEXT: add r3, r9
; CHECK-NEXT: cmp r0, r12
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vstr s0, [r7]
; CHECK-NEXT: bne .LBB0_2
; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -1
  %cmp350 = icmp ugt i32 %sub, 1
  br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %middle.block
  %k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.051, %0
  br label %vector.body

vector.body: ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %10, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi
  %10 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %10)
  %arrayidx14 = getelementptr inbounds float, float* %pOut, i32 %k2.051
  store float %12, float* %arrayidx14, align 4
  %add16 = add nuw i32 %k2.051, 1
  %exitcond52.not = icmp eq i32 %add16, %sub
  br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body
}
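
; DCT_mve2: two dot-products per outer iteration, giving two live vector
; accumulators (q0/q1) across the tail-predicated inner loop.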
define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #2
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB1_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr.w r12, [r0, #8]
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r11, r3, r12, lsl #2
; CHECK-NEXT: add.w r7, r3, r12, lsl #3
; CHECK-NEXT: lsl.w r9, r12, #3
; CHECK-NEXT: .LBB1_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w r10, r4, #1
; CHECK-NEXT: mov r3, r11
; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB1_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [r5], #16
; CHECK-NEXT: vldrw.u32 q3, [r3], #16
; CHECK-NEXT: vfma.f32 q1, q3, q2
; CHECK-NEXT: vldrw.u32 q3, [r0], #16
; CHECK-NEXT: vfma.f32 q0, q3, q2
; CHECK-NEXT: letp lr, .LBB1_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: add.w r0, r2, r10, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: add r11, r9
; CHECK-NEXT: vadd.f32 s6, s6, s7
; CHECK-NEXT: add r7, r9
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vadd.f32 s2, s4, s6
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: add.w r0, r2, r4, lsl #2
; CHECK-NEXT: adds r4, #2
; CHECK-NEXT: cmp r4, r1
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: blo .LBB1_2
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -2
  %cmp371 = icmp ugt i32 %sub, 1
  br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %middle.block
  %k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.072, %0
  %add = add nuw i32 %k2.072, 1
  %mul5 = mul i32 %add, %0
  br label %vector.body

vector.body: ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %15, %vector.body ]
  %vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %16, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi73
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi
  %15 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi
  %16 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi73
  %index.next = add i32 %index, 4
  %17 = icmp eq i32 %index.next, %n.vec
  br i1 %17, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %16)
  %19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %15)
  %arrayidx21 = getelementptr inbounds float, float* %pOut, i32 %k2.072
  store float %18, float* %arrayidx21, align 4
  %arrayidx23 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %19, float* %arrayidx23, align 4
  %add25 = add i32 %k2.072, 2
  %cmp3 = icmp ult i32 %add25, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
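
; DCT_mve3: three dot-products per outer iteration, so three accumulators
; (q0-q2) stay live while the three coefficient-row pointers are still
; post-incremented inside the tail-predicated loop.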
define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: subs r1, #3
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB2_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: add.w r0, r3, r3, lsl #1
; CHECK-NEXT: add.w r9, r1, r3, lsl #2
; CHECK-NEXT: add.w r12, r1, r3, lsl #3
; CHECK-NEXT: adds r3, #3
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add.w r10, r1, r0, lsl #2
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: lsl.w r11, r0, #2
; CHECK-NEXT: add.w r1, r5, r3, lsr #2
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: adds r0, r5, #2
; CHECK-NEXT: adds r2, r5, #1
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r4, r10
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB2_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q3, [r6], #16
; CHECK-NEXT: vldrw.u32 q4, [r3], #16
; CHECK-NEXT: vfma.f32 q1, q4, q3
; CHECK-NEXT: vldrw.u32 q4, [r0], #16
; CHECK-NEXT: vfma.f32 q2, q4, q3
; CHECK-NEXT: vldrw.u32 q4, [r4], #16
; CHECK-NEXT: vfma.f32 q0, q4, q3
; CHECK-NEXT: letp lr, .LBB2_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add r9, r11
; CHECK-NEXT: vadd.f32 s6, s6, s7
; CHECK-NEXT: add.w r0, r1, r2, lsl #2
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: add r12, r11
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: add r10, r11
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vstr s8, [r0]
; CHECK-NEXT: add.w r0, r1, r5, lsl #2
; CHECK-NEXT: adds r5, #3
; CHECK-NEXT: vstr s4, [r0]
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: blo .LBB2_2
; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -3
  %cmp392 = icmp ugt i32 %sub, 1
  br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %middle.block
  %k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.093, %0
  %add = add nuw i32 %k2.093, 1
  %mul5 = mul i32 %add, %0
  %add6 = add i32 %k2.093, 2
  %mul7 = mul i32 %add6, %0
  br label %vector.body

vector.body: ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %20, %vector.body ]
  %vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %21, %vector.body ]
  %vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %22, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi95
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi94
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi
  %20 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi
  %21 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi94
  %22 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi95
  %index.next = add i32 %index, 4
  %23 = icmp eq i32 %index.next, %n.vec
  br i1 %23, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %22)
  %25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %21)
  %26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %20)
  %arrayidx28 = getelementptr inbounds float, float* %pOut, i32 %k2.093
  store float %24, float* %arrayidx28, align 4
  %arrayidx30 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %25, float* %arrayidx30, align 4
  %arrayidx32 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %26, float* %arrayidx32, align 4
  %add34 = add i32 %k2.093, 3
  %cmp3 = icmp ult i32 %add34, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
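
; DCT_mve4: four accumulators (q0-q3). The four row pointers are advanced at
; the end of each outer iteration by a common stride reloaded from the stack.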
define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: .pad #40
; CHECK-NEXT: sub sp, #40
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB3_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r2, [r0, #8]
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: add.w r0, r2, r2, lsl #1
; CHECK-NEXT: add.w r12, r1, r2, lsl #2
; CHECK-NEXT: add.w r8, r1, r2, lsl #3
; CHECK-NEXT: add.w r9, r1, r2, lsl #4
; CHECK-NEXT: add.w r11, r1, r0, lsl #2
; CHECK-NEXT: adds r0, r2, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w r0, r6, r0, lsr #2
; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill
; CHECK-NEXT: lsls r0, r2, #4
; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB3_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
; CHECK-NEXT: adds r0, r6, #3
; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r0, r6, #2
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: adds r0, r6, #1
; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: mov r5, r11
; CHECK-NEXT: mov r4, r9
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB3_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q4, [r1], #16
; CHECK-NEXT: vldrw.u32 q5, [r0], #16
; CHECK-NEXT: vfma.f32 q3, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [r3], #16
; CHECK-NEXT: vfma.f32 q2, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [r5], #16
; CHECK-NEXT: vfma.f32 q1, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [r4], #16
; CHECK-NEXT: vfma.f32 q0, q5, q4
; CHECK-NEXT: letp lr, .LBB3_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vadd.f32 s6, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s12, s12, s14
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vstr s12, [r0]
; CHECK-NEXT: add.w r0, r1, r6, lsl #2
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vstr s8, [r0]
; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s4, [r0]
; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r12, r0
; CHECK-NEXT: add r8, r0
; CHECK-NEXT: add r11, r0
; CHECK-NEXT: add r9, r0
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: blo .LBB3_2
; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #40
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -4
  %cmp3113 = icmp ugt i32 %sub, 1
  br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %middle.block
  %k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0114, %0
  %add = add nuw nsw i32 %k2.0114, 1
  %mul5 = mul i32 %add, %0
  %add6 = add nuw nsw i32 %k2.0114, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add i32 %k2.0114, 3
  %mul9 = mul i32 %add8, %0
  br label %vector.body

vector.body: ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %25, %vector.body ]
  %vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %26, %vector.body ]
  %vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %27, %vector.body ]
  %vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %28, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi116
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi117
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi115
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi
  %25 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi
  %26 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi115
  %27 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi116
  %28 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi117
  %index.next = add i32 %index, 4
  %29 = icmp eq i32 %index.next, %n.vec
  br i1 %29, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %28)
  %31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %27)
  %32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %26)
  %33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %25)
  %arrayidx35 = getelementptr inbounds float, float* %pOut, i32 %k2.0114
  store float %31, float* %arrayidx35, align 4
  %arrayidx37 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %30, float* %arrayidx37, align 4
  %arrayidx39 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %32, float* %arrayidx39, align 4
  %arrayidx41 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %33, float* %arrayidx41, align 4
  %add43 = add i32 %k2.0114, 4
  %cmp3 = icmp ult i32 %add43, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
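
; DCT_mve5: five accumulators (q0-q4) no longer leave room to post-increment
; five separate row pointers, so all but the first row address are formed
; inside the inner loop by repeatedly adding the row stride (r5).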
define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve5:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #5
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB4_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r3, r3, lsl #2
; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB4_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: add.w r10, r0, #2
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: add.w r11, r0, #1
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB4_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r9, r3, r5
; CHECK-NEXT: vldrw.u32 q5, [r4], #16
; CHECK-NEXT: vldrw.u32 q6, [r3], #16
; CHECK-NEXT: vfma.f32 q3, q6, q5
; CHECK-NEXT: add.w r12, r9, r5
; CHECK-NEXT: vldrw.u32 q6, [r9]
; CHECK-NEXT: vfma.f32 q4, q6, q5
; CHECK-NEXT: add.w r6, r12, r5
; CHECK-NEXT: vldrw.u32 q6, [r12]
; CHECK-NEXT: vfma.f32 q2, q6, q5
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vldrw.u32 q6, [r6]
; CHECK-NEXT: vfma.f32 q0, q6, q5
; CHECK-NEXT: vldrw.u32 q6, [r7]
; CHECK-NEXT: vfma.f32 q1, q6, q5
; CHECK-NEXT: letp lr, .LBB4_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: vadd.f32 s18, s18, s19
; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s6, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s1, s16, s18
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s12, s12, s14
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vadd.f32 s6, s8, s10
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: adds r0, #5
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: add.w r1, r2, r10, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB4_2
; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -5
  %cmp3134 = icmp ugt i32 %sub, 1
  br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %middle.block
  %k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0135, %0
  %add = add nuw i32 %k2.0135, 1
  %mul5 = mul i32 %add, %0
  %add6 = add i32 %k2.0135, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add i32 %k2.0135, 3
  %mul9 = mul i32 %add8, %0
  %add10 = add i32 %k2.0135, 4
  %mul11 = mul i32 %add10, %0
  br label %vector.body

vector.body: ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %30, %vector.body ]
  %vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %31, %vector.body ]
  %vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %32, %vector.body ]
  %vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %33, %vector.body ]
  %vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %34, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi137
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi139
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi138
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi136
  %25 = add i32 %index, %mul11
  %26 = getelementptr inbounds float, float* %2, i32 %25
  %27 = bitcast float* %26 to <4 x float>*
  %wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load
  %29 = fadd fast <4 x float> %28, %vec.phi
  %30 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi
  %31 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi136
  %32 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi137
  %33 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi138
  %34 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi139
  %index.next = add i32 %index, 4
  %35 = icmp eq i32 %index.next, %n.vec
  br i1 %35, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %34)
  %37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %33)
  %38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %32)
  %39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %31)
  %40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %30)
  %arrayidx42 = getelementptr inbounds float, float* %pOut, i32 %k2.0135
  store float %38, float* %arrayidx42, align 4
  %arrayidx44 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %36, float* %arrayidx44, align 4
  %arrayidx46 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %37, float* %arrayidx46, align 4
  %arrayidx48 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %39, float* %arrayidx48, align 4
  %arrayidx50 = getelementptr inbounds float, float* %pOut, i32 %add10
  store float %40, float* %arrayidx50, align 4
  %add52 = add i32 %k2.0135, 5
  %cmp3 = icmp ult i32 %add52, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
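
; DCT_mve6: six accumulators (q0-q5) plus the two vectors loaded per step
; (q6/q7) occupy the full MVE register file; d8-d15 must now be saved, but the
; loop still tail-predicates with dlstp/letp and avoids spilling.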
define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve6:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #6
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB5_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r3, r3, lsl #1
; CHECK-NEXT: lsls r1, r1, #3
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB5_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r11, r0, #2
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #1
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q5, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB5_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r12, r3, r5
; CHECK-NEXT: vldrw.u32 q6, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [r3], #16
; CHECK-NEXT: vfma.f32 q4, q7, q6
; CHECK-NEXT: add.w r10, r12, r5
; CHECK-NEXT: vldrw.u32 q7, [r12]
; CHECK-NEXT: vfma.f32 q5, q7, q6
; CHECK-NEXT: add.w r6, r10, r5
; CHECK-NEXT: vldrw.u32 q7, [r10]
; CHECK-NEXT: vfma.f32 q2, q7, q6
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vldrw.u32 q7, [r6]
; CHECK-NEXT: vfma.f32 q0, q7, q6
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vldrw.u32 q7, [r7]
; CHECK-NEXT: vfma.f32 q3, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r6]
; CHECK-NEXT: vfma.f32 q1, q7, q6
; CHECK-NEXT: letp lr, .LBB5_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1
; CHECK-NEXT: vadd.f32 s22, s22, s23
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s20, s20, s21
; CHECK-NEXT: vadd.f32 s18, s18, s19
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s1, s20, s22
; CHECK-NEXT: vadd.f32 s6, s6, s7
; CHECK-NEXT: vadd.f32 s3, s16, s18
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: vstr s3, [r1]
; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s6, s12, s14
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB5_2
; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -6
  %cmp3155 = icmp ugt i32 %sub, 1
  br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
  ret void

for.body: ; preds = %for.body.preheader, %middle.block
  %k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0156, %0
  %add = add nuw i32 %k2.0156, 1
  %mul5 = mul i32 %add, %0
  %add6 = add i32 %k2.0156, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add i32 %k2.0156, 3
  %mul9 = mul i32 %add8, %0
  %add10 = add i32 %k2.0156, 4
  %mul11 = mul i32 %add10, %0
  %add12 = add i32 %k2.0156, 5
  %mul13 = mul i32 %add12, %0
  br label %vector.body

vector.body: ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %35, %vector.body ]
  %vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %36, %vector.body ]
  %vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %37, %vector.body ]
  %vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %38, %vector.body ]
  %vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %39, %vector.body ]
  %vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi158
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi160
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi161
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi159
  %25 = add i32 %index, %mul11
  %26 = getelementptr inbounds float, float* %2, i32 %25
  %27 = bitcast float* %26 to <4 x float>*
  %wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load
  %29 = fadd fast <4 x float> %28, %vec.phi157
  %30 = add i32 %index, %mul13
  %31 = getelementptr inbounds float, float* %2, i32 %30
  %32 = bitcast float* %31 to <4 x float>*
  %wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load
  %34 = fadd fast <4 x float> %33, %vec.phi
  %35 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi
  %36 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi157
  %37 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi158
  %38 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi159
  %39 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi160
  %40 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi161
  %index.next = add i32 %index, 4
  %41 = icmp eq i32 %index.next, %n.vec
  br i1 %41, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
  %43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %39)
  %44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %38)
  %45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %37)
  %46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %36)
  %47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %35)
  %arrayidx49 = getelementptr inbounds float, float* %pOut, i32 %k2.0156
  store float %45, float* %arrayidx49, align 4
  %arrayidx51 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %43, float* %arrayidx51, align 4
  %arrayidx53 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %42, float* %arrayidx53, align 4
  %arrayidx55 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %44, float* %arrayidx55, align 4
  %arrayidx57 = getelementptr inbounds float, float* %pOut, i32 %add10
  store float %46, float* %arrayidx57, align 4
  %arrayidx59 = getelementptr inbounds float, float* %pOut, i32 %add12
  store float %47, float* %arrayidx59, align 4
  %add61 = add i32 %k2.0156, 6
  %cmp3 = icmp ult i32 %add61, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
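
; DCT_mve7: seven accumulators exceed the eight Q registers, so one
; accumulator is kept in a stack slot and shuffled through q-register moves
; each iteration. The loop can no longer use dlstp/letp and falls back to an
; explicit vctp.32/vpst-predicated dls/le loop.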
1049 define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
1050 ; CHECK-LABEL: DCT_mve7:
1051 ; CHECK: @ %bb.0: @ %entry
1052 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1053 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1054 ; CHECK-NEXT: .pad #4
1055 ; CHECK-NEXT: sub sp, #4
1056 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1057 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1058 ; CHECK-NEXT: .pad #72
1059 ; CHECK-NEXT: sub sp, #72
1060 ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
1061 ; CHECK-NEXT: ldr r1, [r0, #4]
1062 ; CHECK-NEXT: subs r1, #7
1063 ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
1064 ; CHECK-NEXT: cmp r1, #2
1065 ; CHECK-NEXT: blo.w .LBB6_5
1066 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1067 ; CHECK-NEXT: ldr r3, [r0, #8]
1068 ; CHECK-NEXT: ldr r1, [r0]
1069 ; CHECK-NEXT: adds r0, r3, #3
1070 ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
1071 ; CHECK-NEXT: bic r0, r0, #3
1072 ; CHECK-NEXT: add.w r9, r1, r3, lsl #2
1073 ; CHECK-NEXT: subs r1, r0, #4
1074 ; CHECK-NEXT: movs r0, #1
1075 ; CHECK-NEXT: lsls r5, r3, #2
1076 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
1077 ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
1078 ; CHECK-NEXT: rsb r1, r3, r3, lsl #3
1079 ; CHECK-NEXT: lsls r1, r1, #2
1080 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
1081 ; CHECK-NEXT: .LBB6_2: @ %for.body
1082 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
1083 ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
1084 ; CHECK-NEXT: adds r1, r0, #6
1085 ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
1086 ; CHECK-NEXT: adds r1, r0, #5
1087 ; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
1088 ; CHECK-NEXT: adds r1, r0, #4
1089 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
1090 ; CHECK-NEXT: adds r1, r0, #3
1091 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
1092 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
1093 ; CHECK-NEXT: vmov.i32 q2, #0x0
1094 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
1095 ; CHECK-NEXT: adds r4, r0, #2
1096 ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
1097 ; CHECK-NEXT: add.w r8, r0, #1
1098 ; CHECK-NEXT: mov r3, r9
1099 ; CHECK-NEXT: vmov q4, q2
1100 ; CHECK-NEXT: vmov q5, q2
1101 ; CHECK-NEXT: vmov q3, q2
1102 ; CHECK-NEXT: vmov q6, q2
1103 ; CHECK-NEXT: vmov q1, q2
1104 ; CHECK-NEXT: mov r12, r7
1105 ; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
1106 ; CHECK-NEXT: dls lr, r6
1107 ; CHECK-NEXT: .LBB6_3: @ %vector.body
1108 ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
1109 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
1110 ; CHECK-NEXT: add.w r10, r3, r5
1111 ; CHECK-NEXT: vctp.32 r12
1112 ; CHECK-NEXT: vpsttt
1113 ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
1114 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
1115 ; CHECK-NEXT: vfmat.f32 q5, q0, q7
1116 ; CHECK-NEXT: add.w r11, r10, r5
1118 ; CHECK-NEXT: vldrwt.u32 q0, [r10]
1119 ; CHECK-NEXT: vfmat.f32 q6, q0, q7
1120 ; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
1122 ; CHECK-NEXT: vldrwt.u32 q0, [r11]
1123 ; CHECK-NEXT: vfmat.f32 q1, q0, q7
1124 ; CHECK-NEXT: add.w r6, r11, r5
1125 ; CHECK-NEXT: vmov q6, q5
1126 ; CHECK-NEXT: vmov q5, q4
1127 ; CHECK-NEXT: vmov q4, q3
1129 ; CHECK-NEXT: vldrwt.u32 q0, [r6]
1130 ; CHECK-NEXT: vmov q3, q1
1131 ; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload
1133 ; CHECK-NEXT: vfmat.f32 q1, q0, q7
1134 ; CHECK-NEXT: adds r7, r6, r5
1135 ; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill
1136 ; CHECK-NEXT: vmov q1, q3
1137 ; CHECK-NEXT: vmov q3, q4
1138 ; CHECK-NEXT: vmov q4, q5
1139 ; CHECK-NEXT: vmov q5, q6
1140 ; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
1141 ; CHECK-NEXT: sub.w r12, r12, #4
1142 ; CHECK-NEXT: adds r6, r7, r5
1144 ; CHECK-NEXT: vldrwt.u32 q0, [r7]
1145 ; CHECK-NEXT: vfmat.f32 q3, q0, q7
1146 ; CHECK-NEXT: adds r7, r6, r5
1147 ; CHECK-NEXT: vpstttt
1148 ; CHECK-NEXT: vldrwt.u32 q0, [r6]
1149 ; CHECK-NEXT: vfmat.f32 q4, q0, q7
1150 ; CHECK-NEXT: vldrwt.u32 q0, [r7]
1151 ; CHECK-NEXT: vfmat.f32 q2, q0, q7
1152 ; CHECK-NEXT: le lr, .LBB6_3
1153 ; CHECK-NEXT: @ %bb.4: @ %middle.block
1154 ; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1
1155 ; CHECK-NEXT: vadd.f32 s0, s26, s27
1156 ; CHECK-NEXT: add.w r1, r2, r8, lsl #2
1157 ; CHECK-NEXT: vadd.f32 s2, s24, s25
1158 ; CHECK-NEXT: vadd.f32 s1, s22, s23
1159 ; CHECK-NEXT: vadd.f32 s3, s20, s21
1160 ; CHECK-NEXT: vadd.f32 s6, s6, s7
1161 ; CHECK-NEXT: vadd.f32 s4, s4, s5
1162 ; CHECK-NEXT: vadd.f32 s10, s10, s11
1163 ; CHECK-NEXT: vadd.f32 s8, s8, s9
1164 ; CHECK-NEXT: vadd.f32 s0, s2, s0
1165 ; CHECK-NEXT: vadd.f32 s9, s18, s19
1166 ; CHECK-NEXT: vadd.f32 s11, s16, s17
1167 ; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
1168 ; CHECK-NEXT: vadd.f32 s2, s3, s1
1169 ; CHECK-NEXT: vadd.f32 s5, s18, s19
1170 ; CHECK-NEXT: vadd.f32 s7, s16, s17
1171 ; CHECK-NEXT: vadd.f32 s4, s4, s6
1172 ; CHECK-NEXT: vstr s0, [r1]
1173 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
1174 ; CHECK-NEXT: vadd.f32 s14, s14, s15
1175 ; CHECK-NEXT: adds r0, #7
1176 ; CHECK-NEXT: vadd.f32 s12, s12, s13
1177 ; CHECK-NEXT: vstr s2, [r1]
1178 ; CHECK-NEXT: add.w r1, r2, r4, lsl #2
1179 ; CHECK-NEXT: vadd.f32 s8, s8, s10
1180 ; CHECK-NEXT: vadd.f32 s6, s7, s5
1181 ; CHECK-NEXT: vstr s4, [r1]
1182 ; CHECK-NEXT: vadd.f32 s10, s11, s9
1183 ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
1184 ; CHECK-NEXT: vadd.f32 s12, s12, s14
1185 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
1186 ; CHECK-NEXT: vstr s6, [r1]
1187 ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
1188 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
1189 ; CHECK-NEXT: vstr s12, [r1]
1190 ; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
1191 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
1192 ; CHECK-NEXT: vstr s10, [r1]
1193 ; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
1194 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
1195 ; CHECK-NEXT: vstr s8, [r1]
1196 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
1197 ; CHECK-NEXT: add r9, r1
1198 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
1199 ; CHECK-NEXT: cmp r0, r1
1200 ; CHECK-NEXT: blo.w .LBB6_2
1201 ; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
1202 ; CHECK-NEXT: add sp, #72
1203 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1204 ; CHECK-NEXT: add sp, #4
1205 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -7
%cmp3176 = icmp ugt i32 %sub, 1
br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:               ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup:                 ; preds = %middle.block, %entry
ret void

for.body:                         ; preds = %for.body.preheader, %middle.block
%k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0177, %0
%add = add nuw i32 %k2.0177, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.0177, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0177, 3
%mul9 = mul i32 %add8, %0
%add10 = add i32 %k2.0177, 4
%mul11 = mul i32 %add10, %0
%add12 = add i32 %k2.0177, 5
%mul13 = mul i32 %add12, %0
%add14 = add i32 %k2.0177, 6
%mul15 = mul i32 %add14, %0
br label %vector.body
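; The vector body below runs seven interleaved multiply-accumulate chains: one
; masked load of %pIn is multiplied against seven coefficient rows of %2 (at
; offsets %mul4..%mul15), each chain accumulating into its own <4 x float> phi,
; with %active.lane.mask predicating both the loads and the final selects.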
vector.body:                      ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
%vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %41, %vector.body ]
%vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %42, %vector.body ]
%vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %43, %vector.body ]
%vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %44, %vector.body ]
%vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
%vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi179
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi181
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi183
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi182
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi180
%30 = add i32 %index, %mul13
%31 = getelementptr inbounds float, float* %2, i32 %30
%32 = bitcast float* %31 to <4 x float>*
%wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load
%34 = fadd fast <4 x float> %33, %vec.phi178
%35 = add i32 %index, %mul15
%36 = getelementptr inbounds float, float* %2, i32 %35
%37 = bitcast float* %36 to <4 x float>*
%wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load
%39 = fadd fast <4 x float> %38, %vec.phi
%40 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi
%41 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi178
%42 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi179
%43 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi180
%44 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi181
%45 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi182
%46 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi183
%index.next = add i32 %index, 4
%47 = icmp eq i32 %index.next, %n.vec
br i1 %47, label %middle.block, label %vector.body
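; middle.block: each of the seven accumulators is reduced with
; llvm.vector.reduce.fadd and the scalar results are stored to
; pOut[k2]..pOut[k2+6] before the outer induction variable steps by 7.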
middle.block:                     ; preds = %vector.body
%48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
%49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
%50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %44)
%51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %43)
%52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %42)
%53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %41)
%54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
%arrayidx56 = getelementptr inbounds float, float* %pOut, i32 %k2.0177
store float %52, float* %arrayidx56, align 4
%arrayidx58 = getelementptr inbounds float, float* %pOut, i32 %add
store float %50, float* %arrayidx58, align 4
%arrayidx60 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %48, float* %arrayidx60, align 4
%arrayidx62 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %49, float* %arrayidx62, align 4
%arrayidx64 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %51, float* %arrayidx64, align 4
%arrayidx66 = getelementptr inbounds float, float* %pOut, i32 %add12
store float %53, float* %arrayidx66, align 4
%arrayidx68 = getelementptr inbounds float, float* %pOut, i32 %add14
store float %54, float* %arrayidx68, align 4
%add70 = add i32 %k2.0177, 7
%cmp3 = icmp ult i32 %add70, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

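; DCT_mve8: the same pattern widened to eight coefficient rows per outer
; iteration. Eight vector accumulators plus the loaded input and coefficient
; vectors no longer fit in the eight MVE Q registers, so the checks below show
; two accumulators being spilled and reloaded across the inner loop
; ([sp, #56] and [sp, #72]), with an explicit vctp/vpst predication sequence
; inside a plain dls/le low-overhead loop rather than a fused tail-predicated
; dlstp/letp loop.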
define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #88
; CHECK-NEXT: sub sp, #88
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #8
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB7_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r12, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: lsls r1, r3, #5
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB7_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
; CHECK-NEXT: adds r1, r0, #7
; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #6
; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #3
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: add.w r8, r0, #2
; CHECK-NEXT: adds r1, r0, #1
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q6, q3
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q7, q3
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: mov r10, r7
; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill
; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB7_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r11, r3, r5
; CHECK-NEXT: vctp.32 r10
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vfmat.f32 q6, q1, q0
; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r11]
; CHECK-NEXT: vfmat.f32 q7, q1, q0
; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q3, q5
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: sub.w r10, r10, #4
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vfmat.f32 q4, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q5, q1, q0
; CHECK-NEXT: add r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q3, q1, q0
; CHECK-NEXT: le lr, .LBB7_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s30, s31
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vadd.f32 s2, s28, s29
; CHECK-NEXT: vadd.f32 s4, s26, s27
; CHECK-NEXT: vadd.f32 s6, s24, s25
; CHECK-NEXT: vadd.f32 s5, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s9, s18, s19
; CHECK-NEXT: vadd.f32 s11, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s13, s18, s19
; CHECK-NEXT: vadd.f32 s15, s16, s17
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s2, s6, s4
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s10, s11, s9
; CHECK-NEXT: vadd.f32 s6, s12, s14
; CHECK-NEXT: vadd.f32 s1, s22, s23
; CHECK-NEXT: vadd.f32 s14, s15, s13
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s3, s20, s21
; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s12, s7, s5
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vstr s14, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s4, s3, s1
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r12, r1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB7_2
; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #88
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -8
%cmp3197 = icmp ugt i32 %sub, 1
br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:               ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup:                 ; preds = %middle.block, %entry
ret void

for.body:                         ; preds = %for.body.preheader, %middle.block
%k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0198, %0
%add = add nuw nsw i32 %k2.0198, 1
%mul5 = mul i32 %add, %0
%add6 = add nuw nsw i32 %k2.0198, 2
%mul7 = mul i32 %add6, %0
%add8 = add nuw nsw i32 %k2.0198, 3
%mul9 = mul i32 %add8, %0
%add10 = add nuw nsw i32 %k2.0198, 4
%mul11 = mul i32 %add10, %0
%add12 = add nuw nsw i32 %k2.0198, 5
%mul13 = mul i32 %add12, %0
%add14 = add nuw nsw i32 %k2.0198, 6
%mul15 = mul i32 %add14, %0
%add16 = add i32 %k2.0198, 7
%mul17 = mul i32 %add16, %0
br label %vector.body
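; Same structure as the previous function's vector body, plus an eighth row
; offset (%mul17) and an eighth accumulator phi feeding the interleaved masked
; multiply-accumulate chains.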
vector.body:                      ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
%vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
%vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %47, %vector.body ]
%vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %48, %vector.body ]
%vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %49, %vector.body ]
%vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %50, %vector.body ]
%vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %51, %vector.body ]
%vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %52, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi200
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi202
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi204
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi205
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi203
%30 = add i32 %index, %mul13
%31 = getelementptr inbounds float, float* %2, i32 %30
%32 = bitcast float* %31 to <4 x float>*
%wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load
%34 = fadd fast <4 x float> %33, %vec.phi201
%35 = add i32 %index, %mul15
%36 = getelementptr inbounds float, float* %2, i32 %35
%37 = bitcast float* %36 to <4 x float>*
%wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load
%39 = fadd fast <4 x float> %38, %vec.phi199
%40 = add i32 %index, %mul17
%41 = getelementptr inbounds float, float* %2, i32 %40
%42 = bitcast float* %41 to <4 x float>*
%wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %42, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load
%44 = fadd fast <4 x float> %43, %vec.phi
%45 = select <4 x i1> %active.lane.mask, <4 x float> %44, <4 x float> %vec.phi
%46 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi199
%47 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi200
%48 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi201
%49 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi202
%50 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi203
%51 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi204
%52 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi205
%index.next = add i32 %index, 4
%53 = icmp eq i32 %index.next, %n.vec
br i1 %53, label %middle.block, label %vector.body
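; middle.block: eight llvm.vector.reduce.fadd reductions, stored to
; pOut[k2]..pOut[k2+7]; the outer induction variable steps by 8.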
middle.block:                     ; preds = %vector.body
%54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %52)
%55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %51)
%56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %50)
%57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %49)
%58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %48)
%59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %47)
%60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
%61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
%arrayidx63 = getelementptr inbounds float, float* %pOut, i32 %k2.0198
store float %59, float* %arrayidx63, align 4
%arrayidx65 = getelementptr inbounds float, float* %pOut, i32 %add
store float %57, float* %arrayidx65, align 4
%arrayidx67 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %55, float* %arrayidx67, align 4
%arrayidx69 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %54, float* %arrayidx69, align 4
%arrayidx71 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %56, float* %arrayidx71, align 4
%arrayidx73 = getelementptr inbounds float, float* %pOut, i32 %add12
store float %58, float* %arrayidx73, align 4
%arrayidx75 = getelementptr inbounds float, float* %pOut, i32 %add14
store float %60, float* %arrayidx75, align 4
%arrayidx77 = getelementptr inbounds float, float* %pOut, i32 %add16
store float %61, float* %arrayidx77, align 4
%add79 = add i32 %k2.0198, 8
%cmp3 = icmp ult i32 %add79, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

declare void @llvm.assume(i1 noundef)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)