1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
; DCT instance descriptor. Field meanings (established by the getelementptr
; field indices used in the functions below):
;   field 0 (ptr) = base of the DCT coefficient matrix,
;   field 1 (i32) = number of filters,
;   field 2 (i32) = inputs per filter (coefficient-row length).
4 %struct.DCT_InstanceTypeDef = type { ptr, i32, i32 }
; DCT_mve1: per outer iteration, computes ONE fast-math dot product of %pIn
; against coefficient row k (base pointer from %S field 0, row length %i from
; %S field 2) using tail-predicated <4 x float> masked loads, then stores the
; scalar reduction to %pOut[k]. k starts at 1 and runs while k+1 != %i1-1
; (%i1 = filter count from %S field 1). @llvm.assume guarantees %i > 1.
; NOTE(review): the assertion comments interleaved below are autogenerated by
; update_llc_test_checks.py -- regenerate them rather than hand-editing.
; NOTE(review): several original lines appear elided in this chunk (no entry
; label, ret, or closing brace visible) -- confirm against the full file.
6 define void @DCT_mve1(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
7 ; CHECK-LABEL: DCT_mve1:
8 ; CHECK: @ %bb.0: @ %entry
9 ; CHECK-NEXT: ldr r3, [r0, #4]
10 ; CHECK-NEXT: sub.w r12, r3, #1
11 ; CHECK-NEXT: cmp.w r12, #2
14 ; CHECK-NEXT: .LBB0_1: @ %for.body.preheader
15 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
16 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
17 ; CHECK-NEXT: ldr r5, [r0, #8]
18 ; CHECK-NEXT: ldr r3, [r0]
19 ; CHECK-NEXT: add.w r3, r3, r5, lsl #2
20 ; CHECK-NEXT: movs r0, #1
21 ; CHECK-NEXT: lsl.w r9, r5, #2
22 ; CHECK-NEXT: .LBB0_2: @ %for.body
23 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
24 ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
25 ; CHECK-NEXT: vmov.i32 q0, #0x0
26 ; CHECK-NEXT: mov r6, r1
27 ; CHECK-NEXT: mov r7, r3
28 ; CHECK-NEXT: dlstp.32 lr, r5
29 ; CHECK-NEXT: .LBB0_3: @ %vector.body
30 ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
31 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
32 ; CHECK-NEXT: vldrw.u32 q1, [r6], #16
33 ; CHECK-NEXT: vldrw.u32 q2, [r7], #16
34 ; CHECK-NEXT: vfma.f32 q0, q2, q1
35 ; CHECK-NEXT: letp lr, .LBB0_3
36 ; CHECK-NEXT: @ %bb.4: @ %middle.block
37 ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
38 ; CHECK-NEXT: vadd.f32 s2, s2, s3
39 ; CHECK-NEXT: add.w r7, r2, r0, lsl #2
40 ; CHECK-NEXT: vadd.f32 s0, s0, s1
41 ; CHECK-NEXT: adds r0, #1
42 ; CHECK-NEXT: add r3, r9
43 ; CHECK-NEXT: cmp r0, r12
44 ; CHECK-NEXT: vadd.f32 s0, s0, s2
45 ; CHECK-NEXT: vstr s0, [r7]
46 ; CHECK-NEXT: bne .LBB0_2
47 ; CHECK-NEXT: @ %bb.5:
48 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, lr}
; entry block (label not visible in this chunk): load the instance fields and
; guard the outer loop.
51 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
52 %i = load i32, ptr %NumInputs, align 4
53 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
54 %i1 = load i32, ptr %NumFilters, align 4
55 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
56 %i2 = load ptr, ptr %pDCTCoefs, align 4
57 %cmp = icmp ugt i32 %i, 1
58 tail call void @llvm.assume(i1 %cmp)
59 %sub = add i32 %i1, -1
60 %cmp350 = icmp ugt i32 %sub, 1
61 br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup
63 for.body.preheader: ; preds = %entry
; round the trip count up to a multiple of 4 (one <4 x float> step per lane).
64 %n.rnd.up = add i32 %i, 3
65 %n.vec = and i32 %n.rnd.up, -4
68 for.cond.cleanup: ; preds = %middle.block, %entry
; outer loop: one coefficient row (row index %k2.051) per iteration.
71 for.body: ; preds = %middle.block, %for.body.preheader
72 %k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ]
73 %mul4 = mul i32 %k2.051, %i
; inner loop: 4 floats per step; the active-lane mask predicates the final
; partial step, and the select keeps inactive lanes of the accumulator.
76 vector.body: ; preds = %vector.body, %for.body
77 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
78 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i10, %vector.body ]
79 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
80 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
81 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
82 %i5 = add i32 %index, %mul4
83 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
84 %wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
85 %i8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load
86 %i9 = fadd fast <4 x float> %i8, %vec.phi
87 %i10 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi
88 %index.next = add i32 %index, 4
89 %i11 = icmp eq i32 %index.next, %n.vec
90 br i1 %i11, label %middle.block, label %vector.body
; horizontal fast-math reduction of the accumulator, then scalar store of
; this row's dot product to %pOut[k].
92 middle.block: ; preds = %vector.body
93 %i12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i10)
94 %arrayidx14 = getelementptr inbounds float, ptr %pOut, i32 %k2.051
95 store float %i12, ptr %arrayidx14, align 4
96 %add16 = add nuw i32 %k2.051, 1
97 %exitcond52.not = icmp eq i32 %add16, %sub
98 br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body
; DCT_mve2: like DCT_mve1 but processes TWO coefficient rows (k and k+1) per
; outer iteration, keeping two <4 x float> accumulators live through the
; tail-predicated inner loop, and storing the two scalar reductions to
; %pOut[k] and %pOut[k+1]. k starts at 1, advances by 2, and the outer bound
; is %sub = %i1 - 2 (%i1 = filter count). @llvm.assume guarantees %i > 1.
; NOTE(review): the assertion comments interleaved below are autogenerated by
; update_llc_test_checks.py -- regenerate them rather than hand-editing.
; NOTE(review): several original lines appear elided in this chunk (no entry
; label, ret, or closing brace visible) -- confirm against the full file.
101 define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
102 ; CHECK-LABEL: DCT_mve2:
103 ; CHECK: @ %bb.0: @ %entry
104 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
105 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
106 ; CHECK-NEXT: .pad #4
107 ; CHECK-NEXT: sub sp, #4
108 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
109 ; CHECK-NEXT: ldr r1, [r0, #4]
110 ; CHECK-NEXT: subs r1, #2
111 ; CHECK-NEXT: cmp r1, #2
112 ; CHECK-NEXT: blo .LBB1_5
113 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
114 ; CHECK-NEXT: ldr.w r12, [r0, #8]
115 ; CHECK-NEXT: movs r4, #1
116 ; CHECK-NEXT: ldr r3, [r0]
117 ; CHECK-NEXT: add.w r11, r3, r12, lsl #2
118 ; CHECK-NEXT: add.w r7, r3, r12, lsl #3
119 ; CHECK-NEXT: lsl.w r9, r12, #3
120 ; CHECK-NEXT: .LBB1_2: @ %for.body
121 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
122 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
123 ; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
124 ; CHECK-NEXT: vmov.i32 q0, #0x0
125 ; CHECK-NEXT: add.w r10, r4, #1
126 ; CHECK-NEXT: mov r3, r11
127 ; CHECK-NEXT: mov r0, r7
128 ; CHECK-NEXT: vmov q1, q0
129 ; CHECK-NEXT: dlstp.32 lr, r12
130 ; CHECK-NEXT: .LBB1_3: @ %vector.body
131 ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
132 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
133 ; CHECK-NEXT: vldrw.u32 q2, [r5], #16
134 ; CHECK-NEXT: vldrw.u32 q3, [r3], #16
135 ; CHECK-NEXT: vfma.f32 q1, q3, q2
136 ; CHECK-NEXT: vldrw.u32 q3, [r0], #16
137 ; CHECK-NEXT: vfma.f32 q0, q3, q2
138 ; CHECK-NEXT: letp lr, .LBB1_3
139 ; CHECK-NEXT: @ %bb.4: @ %middle.block
140 ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
141 ; CHECK-NEXT: vadd.f32 s2, s2, s3
142 ; CHECK-NEXT: add.w r0, r2, r10, lsl #2
143 ; CHECK-NEXT: vadd.f32 s0, s0, s1
144 ; CHECK-NEXT: add r11, r9
145 ; CHECK-NEXT: vadd.f32 s6, s6, s7
146 ; CHECK-NEXT: add r7, r9
147 ; CHECK-NEXT: vadd.f32 s4, s4, s5
148 ; CHECK-NEXT: vadd.f32 s0, s0, s2
149 ; CHECK-NEXT: vadd.f32 s2, s4, s6
150 ; CHECK-NEXT: vstr s0, [r0]
151 ; CHECK-NEXT: add.w r0, r2, r4, lsl #2
152 ; CHECK-NEXT: adds r4, #2
153 ; CHECK-NEXT: cmp r4, r1
154 ; CHECK-NEXT: vstr s2, [r0]
155 ; CHECK-NEXT: blo .LBB1_2
156 ; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
157 ; CHECK-NEXT: add sp, #4
158 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; entry block (label not visible in this chunk): load the instance fields and
; guard the outer loop.
160 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
161 %i = load i32, ptr %NumInputs, align 4
162 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
163 %i1 = load i32, ptr %NumFilters, align 4
164 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
165 %i2 = load ptr, ptr %pDCTCoefs, align 4
166 %cmp = icmp ugt i32 %i, 1
167 tail call void @llvm.assume(i1 %cmp)
168 %sub = add i32 %i1, -2
169 %cmp371 = icmp ugt i32 %sub, 1
170 br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup
172 for.body.preheader: ; preds = %entry
; round the trip count up to a multiple of 4 (one <4 x float> step per lane).
173 %n.rnd.up = add i32 %i, 3
174 %n.vec = and i32 %n.rnd.up, -4
177 for.cond.cleanup: ; preds = %middle.block, %entry
; outer loop: rows k and k+1 per iteration; precompute both row offsets.
180 for.body: ; preds = %middle.block, %for.body.preheader
181 %k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ]
182 %mul4 = mul i32 %k2.072, %i
183 %add = add nuw i32 %k2.072, 1
184 %mul5 = mul i32 %add, %i
185 br label %vector.body
; inner loop: one shared masked load of %pIn feeds two row accumulators; the
; selects keep inactive lanes unchanged under tail predication.
187 vector.body: ; preds = %vector.body, %for.body
188 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
189 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i15, %vector.body ]
190 %vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i16, %vector.body ]
191 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
192 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
193 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
194 %i5 = add i32 %index, %mul4
195 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
196 %wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
197 %i8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load
198 %i9 = fadd fast <4 x float> %i8, %vec.phi73
199 %i10 = add i32 %index, %mul5
200 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
201 %wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
202 %i13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load
203 %i14 = fadd fast <4 x float> %i13, %vec.phi
204 %i15 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi
205 %i16 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi73
206 %index.next = add i32 %index, 4
207 %i17 = icmp eq i32 %index.next, %n.vec
208 br i1 %i17, label %middle.block, label %vector.body
; horizontal fast-math reductions of both accumulators, then scalar stores to
; %pOut[k] and %pOut[k+1].
210 middle.block: ; preds = %vector.body
211 %i18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i16)
212 %i19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i15)
213 %arrayidx21 = getelementptr inbounds float, ptr %pOut, i32 %k2.072
214 store float %i18, ptr %arrayidx21, align 4
215 %arrayidx23 = getelementptr inbounds float, ptr %pOut, i32 %add
216 store float %i19, ptr %arrayidx23, align 4
217 %add25 = add i32 %k2.072, 2
218 %cmp3 = icmp ult i32 %add25, %sub
219 br i1 %cmp3, label %for.body, label %for.cond.cleanup
; DCT_mve3: processes THREE coefficient rows (k, k+1, k+2) per outer
; iteration with three <4 x float> accumulators through the tail-predicated
; inner loop, storing the three scalar reductions to %pOut[k..k+2]. k starts
; at 1, advances by 3, and the outer bound is %sub = %i1 - 3 (%i1 = filter
; count). @llvm.assume guarantees the row length %i > 1.
; NOTE(review): the assertion comments interleaved below are autogenerated by
; update_llc_test_checks.py -- regenerate them rather than hand-editing.
; NOTE(review): several original lines appear elided in this chunk (no entry
; label, ret, or closing brace visible) -- confirm against the full file.
222 define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
223 ; CHECK-LABEL: DCT_mve3:
224 ; CHECK: @ %bb.0: @ %entry
225 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
226 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
227 ; CHECK-NEXT: .pad #4
228 ; CHECK-NEXT: sub sp, #4
229 ; CHECK-NEXT: .vsave {d8, d9}
230 ; CHECK-NEXT: vpush {d8, d9}
231 ; CHECK-NEXT: .pad #24
232 ; CHECK-NEXT: sub sp, #24
233 ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
234 ; CHECK-NEXT: ldr r1, [r0, #4]
235 ; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
236 ; CHECK-NEXT: subs r1, #3
237 ; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
238 ; CHECK-NEXT: cmp r1, #2
239 ; CHECK-NEXT: blo .LBB2_5
240 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
241 ; CHECK-NEXT: ldr r3, [r0, #8]
242 ; CHECK-NEXT: movs r5, #1
243 ; CHECK-NEXT: ldr r1, [r0]
244 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
245 ; CHECK-NEXT: add.w r0, r3, r3, lsl #1
246 ; CHECK-NEXT: add.w r9, r1, r3, lsl #2
247 ; CHECK-NEXT: add.w r12, r1, r3, lsl #3
248 ; CHECK-NEXT: adds r3, #3
249 ; CHECK-NEXT: bic r3, r3, #3
250 ; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
251 ; CHECK-NEXT: add.w r10, r1, r0, lsl #2
252 ; CHECK-NEXT: subs r3, #4
253 ; CHECK-NEXT: lsl.w r11, r0, #2
254 ; CHECK-NEXT: add.w r1, r5, r3, lsr #2
255 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
256 ; CHECK-NEXT: .LBB2_2: @ %for.body
257 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
258 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
259 ; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
260 ; CHECK-NEXT: vmov.i32 q0, #0x0
261 ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
262 ; CHECK-NEXT: adds r0, r5, #2
263 ; CHECK-NEXT: adds r2, r5, #1
264 ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
265 ; CHECK-NEXT: mov r3, r9
266 ; CHECK-NEXT: mov r0, r12
267 ; CHECK-NEXT: mov r4, r10
268 ; CHECK-NEXT: vmov q2, q0
269 ; CHECK-NEXT: vmov q1, q0
270 ; CHECK-NEXT: dlstp.32 lr, r7
271 ; CHECK-NEXT: .LBB2_3: @ %vector.body
272 ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
273 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
274 ; CHECK-NEXT: vldrw.u32 q3, [r6], #16
275 ; CHECK-NEXT: vldrw.u32 q4, [r3], #16
276 ; CHECK-NEXT: vfma.f32 q1, q4, q3
277 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16
278 ; CHECK-NEXT: vfma.f32 q2, q4, q3
279 ; CHECK-NEXT: vldrw.u32 q4, [r4], #16
280 ; CHECK-NEXT: vfma.f32 q0, q4, q3
281 ; CHECK-NEXT: letp lr, .LBB2_3
282 ; CHECK-NEXT: @ %bb.4: @ %middle.block
283 ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
284 ; CHECK-NEXT: vadd.f32 s10, s10, s11
285 ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
286 ; CHECK-NEXT: vadd.f32 s8, s8, s9
287 ; CHECK-NEXT: add r9, r11
288 ; CHECK-NEXT: vadd.f32 s6, s6, s7
289 ; CHECK-NEXT: add.w r0, r1, r2, lsl #2
290 ; CHECK-NEXT: vadd.f32 s4, s4, s5
291 ; CHECK-NEXT: add r12, r11
292 ; CHECK-NEXT: vadd.f32 s2, s2, s3
293 ; CHECK-NEXT: add r10, r11
294 ; CHECK-NEXT: vadd.f32 s0, s0, s1
295 ; CHECK-NEXT: vadd.f32 s8, s8, s10
296 ; CHECK-NEXT: vadd.f32 s4, s4, s6
297 ; CHECK-NEXT: vadd.f32 s0, s0, s2
298 ; CHECK-NEXT: vstr s8, [r0]
299 ; CHECK-NEXT: add.w r0, r1, r5, lsl #2
300 ; CHECK-NEXT: adds r5, #3
301 ; CHECK-NEXT: vstr s4, [r0]
302 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
303 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
304 ; CHECK-NEXT: vstr s0, [r0]
305 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
306 ; CHECK-NEXT: cmp r5, r0
307 ; CHECK-NEXT: blo .LBB2_2
308 ; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
309 ; CHECK-NEXT: add sp, #24
310 ; CHECK-NEXT: vpop {d8, d9}
311 ; CHECK-NEXT: add sp, #4
312 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; entry block (label not visible in this chunk): load the instance fields and
; guard the outer loop.
314 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
315 %i = load i32, ptr %NumInputs, align 4
316 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
317 %i1 = load i32, ptr %NumFilters, align 4
318 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
319 %i2 = load ptr, ptr %pDCTCoefs, align 4
320 %cmp = icmp ugt i32 %i, 1
321 tail call void @llvm.assume(i1 %cmp)
322 %sub = add i32 %i1, -3
323 %cmp392 = icmp ugt i32 %sub, 1
324 br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup
326 for.body.preheader: ; preds = %entry
; round the trip count up to a multiple of 4 (one <4 x float> step per lane).
327 %n.rnd.up = add i32 %i, 3
328 %n.vec = and i32 %n.rnd.up, -4
331 for.cond.cleanup: ; preds = %middle.block, %entry
; outer loop: rows k, k+1, k+2 per iteration; precompute all three offsets.
334 for.body: ; preds = %middle.block, %for.body.preheader
335 %k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ]
336 %mul4 = mul i32 %k2.093, %i
337 %add = add nuw i32 %k2.093, 1
338 %mul5 = mul i32 %add, %i
339 %add6 = add i32 %k2.093, 2
340 %mul7 = mul i32 %add6, %i
341 br label %vector.body
; inner loop: one shared masked load of %pIn feeds three row accumulators;
; the selects keep inactive lanes unchanged under tail predication.
343 vector.body: ; preds = %vector.body, %for.body
344 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
345 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i20, %vector.body ]
346 %vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i21, %vector.body ]
347 %vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i22, %vector.body ]
348 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
349 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
350 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
351 %i5 = add i32 %index, %mul4
352 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
353 %wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
354 %i8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load
355 %i9 = fadd fast <4 x float> %i8, %vec.phi95
356 %i10 = add i32 %index, %mul5
357 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
358 %wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
359 %i13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load
360 %i14 = fadd fast <4 x float> %i13, %vec.phi94
361 %i15 = add i32 %index, %mul7
362 %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
363 %wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
364 %i18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load
365 %i19 = fadd fast <4 x float> %i18, %vec.phi
366 %i20 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi
367 %i21 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi94
368 %i22 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi95
369 %index.next = add i32 %index, 4
370 %i23 = icmp eq i32 %index.next, %n.vec
371 br i1 %i23, label %middle.block, label %vector.body
; horizontal fast-math reductions of all three accumulators, then scalar
; stores to %pOut[k..k+2].
373 middle.block: ; preds = %vector.body
374 %i24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i22)
375 %i25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i21)
376 %i26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i20)
377 %arrayidx28 = getelementptr inbounds float, ptr %pOut, i32 %k2.093
378 store float %i24, ptr %arrayidx28, align 4
379 %arrayidx30 = getelementptr inbounds float, ptr %pOut, i32 %add
380 store float %i25, ptr %arrayidx30, align 4
381 %arrayidx32 = getelementptr inbounds float, ptr %pOut, i32 %add6
382 store float %i26, ptr %arrayidx32, align 4
383 %add34 = add i32 %k2.093, 3
384 %cmp3 = icmp ult i32 %add34, %sub
385 br i1 %cmp3, label %for.body, label %for.cond.cleanup
; DCT_mve4: processes FOUR coefficient rows (k..k+3) per outer iteration with
; four <4 x float> accumulators through the tail-predicated inner loop,
; storing the four scalar reductions to %pOut[k..k+3]. k starts at 1,
; advances by 4, and the outer bound is %sub = %i1 - 4 (%i1 = filter count).
; @llvm.assume guarantees the row length %i > 1.
; NOTE(review): the assertion comments interleaved below are autogenerated by
; update_llc_test_checks.py -- regenerate them rather than hand-editing.
; NOTE(review): several original lines appear elided in this chunk (no entry
; label, ret, or closing brace visible) -- confirm against the full file.
388 define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
389 ; CHECK-LABEL: DCT_mve4:
390 ; CHECK: @ %bb.0: @ %entry
391 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
392 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
393 ; CHECK-NEXT: .pad #4
394 ; CHECK-NEXT: sub sp, #4
395 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
396 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
397 ; CHECK-NEXT: .pad #40
398 ; CHECK-NEXT: sub sp, #40
399 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
400 ; CHECK-NEXT: ldr r1, [r0, #4]
401 ; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
402 ; CHECK-NEXT: subs r1, #4
403 ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
404 ; CHECK-NEXT: cmp r1, #2
405 ; CHECK-NEXT: blo.w .LBB3_5
406 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
407 ; CHECK-NEXT: ldr r2, [r0, #8]
408 ; CHECK-NEXT: movs r6, #1
409 ; CHECK-NEXT: ldr r1, [r0]
410 ; CHECK-NEXT: add.w r0, r2, r2, lsl #1
411 ; CHECK-NEXT: add.w r12, r1, r2, lsl #2
412 ; CHECK-NEXT: add.w r8, r1, r2, lsl #3
413 ; CHECK-NEXT: add.w r9, r1, r2, lsl #4
414 ; CHECK-NEXT: add.w r11, r1, r0, lsl #2
415 ; CHECK-NEXT: adds r0, r2, #3
416 ; CHECK-NEXT: bic r0, r0, #3
417 ; CHECK-NEXT: subs r0, #4
418 ; CHECK-NEXT: add.w r0, r6, r0, lsr #2
419 ; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill
420 ; CHECK-NEXT: lsls r0, r2, #4
421 ; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
422 ; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
423 ; CHECK-NEXT: .LBB3_2: @ %for.body
424 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
425 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
426 ; CHECK-NEXT: adds r0, r6, #3
427 ; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
428 ; CHECK-NEXT: adds r0, r6, #2
429 ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
430 ; CHECK-NEXT: vmov.i32 q0, #0x0
431 ; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill
432 ; CHECK-NEXT: adds r0, r6, #1
433 ; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
434 ; CHECK-NEXT: mov r3, r12
435 ; CHECK-NEXT: mov r0, r8
436 ; CHECK-NEXT: mov r5, r11
437 ; CHECK-NEXT: mov r4, r9
438 ; CHECK-NEXT: vmov q1, q0
439 ; CHECK-NEXT: vmov q2, q0
440 ; CHECK-NEXT: vmov q3, q0
441 ; CHECK-NEXT: dlstp.32 lr, r7
442 ; CHECK-NEXT: .LBB3_3: @ %vector.body
443 ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
444 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
445 ; CHECK-NEXT: vldrw.u32 q4, [r1], #16
446 ; CHECK-NEXT: vldrw.u32 q5, [r0], #16
447 ; CHECK-NEXT: vfma.f32 q3, q5, q4
448 ; CHECK-NEXT: vldrw.u32 q5, [r3], #16
449 ; CHECK-NEXT: vfma.f32 q2, q5, q4
450 ; CHECK-NEXT: vldrw.u32 q5, [r5], #16
451 ; CHECK-NEXT: vfma.f32 q1, q5, q4
452 ; CHECK-NEXT: vldrw.u32 q5, [r4], #16
453 ; CHECK-NEXT: vfma.f32 q0, q5, q4
454 ; CHECK-NEXT: letp lr, .LBB3_3
455 ; CHECK-NEXT: @ %bb.4: @ %middle.block
456 ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
457 ; CHECK-NEXT: vadd.f32 s14, s14, s15
458 ; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
459 ; CHECK-NEXT: vadd.f32 s12, s12, s13
460 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
461 ; CHECK-NEXT: vadd.f32 s10, s10, s11
462 ; CHECK-NEXT: vadd.f32 s8, s8, s9
463 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
464 ; CHECK-NEXT: vadd.f32 s6, s6, s7
465 ; CHECK-NEXT: vadd.f32 s4, s4, s5
466 ; CHECK-NEXT: vadd.f32 s2, s2, s3
467 ; CHECK-NEXT: vadd.f32 s0, s0, s1
468 ; CHECK-NEXT: vadd.f32 s12, s12, s14
469 ; CHECK-NEXT: vadd.f32 s8, s8, s10
470 ; CHECK-NEXT: vadd.f32 s4, s4, s6
471 ; CHECK-NEXT: vadd.f32 s0, s0, s2
472 ; CHECK-NEXT: vstr s12, [r0]
473 ; CHECK-NEXT: add.w r0, r1, r6, lsl #2
474 ; CHECK-NEXT: adds r6, #4
475 ; CHECK-NEXT: vstr s8, [r0]
476 ; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
477 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
478 ; CHECK-NEXT: vstr s4, [r0]
479 ; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
480 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2
481 ; CHECK-NEXT: vstr s0, [r0]
482 ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
483 ; CHECK-NEXT: add r12, r0
484 ; CHECK-NEXT: add r8, r0
485 ; CHECK-NEXT: add r11, r0
486 ; CHECK-NEXT: add r9, r0
487 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
488 ; CHECK-NEXT: cmp r6, r0
489 ; CHECK-NEXT: blo .LBB3_2
490 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
491 ; CHECK-NEXT: add sp, #40
492 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
493 ; CHECK-NEXT: add sp, #4
494 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; entry block (label not visible in this chunk): load the instance fields and
; guard the outer loop.
496 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
497 %i = load i32, ptr %NumInputs, align 4
498 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
499 %i1 = load i32, ptr %NumFilters, align 4
500 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
501 %i2 = load ptr, ptr %pDCTCoefs, align 4
502 %cmp = icmp ugt i32 %i, 1
503 tail call void @llvm.assume(i1 %cmp)
504 %sub = add i32 %i1, -4
505 %cmp3113 = icmp ugt i32 %sub, 1
506 br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup
508 for.body.preheader: ; preds = %entry
; round the trip count up to a multiple of 4 (one <4 x float> step per lane).
509 %n.rnd.up = add i32 %i, 3
510 %n.vec = and i32 %n.rnd.up, -4
513 for.cond.cleanup: ; preds = %middle.block, %entry
; outer loop: rows k..k+3 per iteration; precompute all four row offsets.
516 for.body: ; preds = %middle.block, %for.body.preheader
517 %k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ]
518 %mul4 = mul i32 %k2.0114, %i
519 %add = add nuw nsw i32 %k2.0114, 1
520 %mul5 = mul i32 %add, %i
521 %add6 = add nuw nsw i32 %k2.0114, 2
522 %mul7 = mul i32 %add6, %i
523 %add8 = add i32 %k2.0114, 3
524 %mul9 = mul i32 %add8, %i
525 br label %vector.body
; inner loop: one shared masked load of %pIn feeds four row accumulators;
; the selects keep inactive lanes unchanged under tail predication.
527 vector.body: ; preds = %vector.body, %for.body
528 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
529 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i25, %vector.body ]
530 %vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i26, %vector.body ]
531 %vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i27, %vector.body ]
532 %vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i28, %vector.body ]
533 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
534 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
535 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
536 %i5 = add i32 %index, %mul4
537 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
538 %wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
539 %i8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load
540 %i9 = fadd fast <4 x float> %i8, %vec.phi116
541 %i10 = add i32 %index, %mul5
542 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
543 %wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
544 %i13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load
545 %i14 = fadd fast <4 x float> %i13, %vec.phi117
546 %i15 = add i32 %index, %mul7
547 %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
548 %wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
549 %i18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load
550 %i19 = fadd fast <4 x float> %i18, %vec.phi115
551 %i20 = add i32 %index, %mul9
552 %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
553 %wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
554 %i23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load
555 %i24 = fadd fast <4 x float> %i23, %vec.phi
556 %i25 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi
557 %i26 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi115
558 %i27 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi116
559 %i28 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi117
560 %index.next = add i32 %index, 4
561 %i29 = icmp eq i32 %index.next, %n.vec
562 br i1 %i29, label %middle.block, label %vector.body
; horizontal fast-math reductions of all four accumulators, then scalar
; stores to %pOut[k..k+3].
564 middle.block: ; preds = %vector.body
565 %i30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i28)
566 %i31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i27)
567 %i32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i26)
568 %i33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i25)
569 %arrayidx35 = getelementptr inbounds float, ptr %pOut, i32 %k2.0114
570 store float %i31, ptr %arrayidx35, align 4
571 %arrayidx37 = getelementptr inbounds float, ptr %pOut, i32 %add
572 store float %i30, ptr %arrayidx37, align 4
573 %arrayidx39 = getelementptr inbounds float, ptr %pOut, i32 %add6
574 store float %i32, ptr %arrayidx39, align 4
575 %arrayidx41 = getelementptr inbounds float, ptr %pOut, i32 %add8
576 store float %i33, ptr %arrayidx41, align 4
577 %add43 = add i32 %k2.0114, 4
578 %cmp3 = icmp ult i32 %add43, %sub
579 br i1 %cmp3, label %for.body, label %for.cond.cleanup
582 define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
583 ; CHECK-LABEL: DCT_mve5:
584 ; CHECK: @ %bb.0: @ %entry
585 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
586 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
587 ; CHECK-NEXT: .pad #4
588 ; CHECK-NEXT: sub sp, #4
589 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
590 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
591 ; CHECK-NEXT: .pad #32
592 ; CHECK-NEXT: sub sp, #32
593 ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
594 ; CHECK-NEXT: ldr r1, [r0, #4]
595 ; CHECK-NEXT: subs r1, #5
596 ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
597 ; CHECK-NEXT: cmp r1, #2
598 ; CHECK-NEXT: blo.w .LBB4_5
599 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
600 ; CHECK-NEXT: ldr r3, [r0, #8]
601 ; CHECK-NEXT: ldr r1, [r0]
602 ; CHECK-NEXT: adds r0, r3, #3
603 ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
604 ; CHECK-NEXT: bic r0, r0, #3
605 ; CHECK-NEXT: add.w r8, r1, r3, lsl #2
606 ; CHECK-NEXT: subs r1, r0, #4
607 ; CHECK-NEXT: movs r0, #1
608 ; CHECK-NEXT: lsls r5, r3, #2
609 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
610 ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
611 ; CHECK-NEXT: add.w r1, r3, r3, lsl #2
612 ; CHECK-NEXT: lsls r1, r1, #2
613 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
614 ; CHECK-NEXT: .LBB4_2: @ %for.body
615 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
616 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
617 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
618 ; CHECK-NEXT: adds r1, r0, #4
619 ; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
620 ; CHECK-NEXT: vmov.i32 q1, #0x0
621 ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
622 ; CHECK-NEXT: add.w r10, r0, #2
623 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
624 ; CHECK-NEXT: adds r1, r0, #3
625 ; CHECK-NEXT: add.w r11, r0, #1
626 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
627 ; CHECK-NEXT: mov r3, r8
628 ; CHECK-NEXT: vmov q0, q1
629 ; CHECK-NEXT: vmov q3, q1
630 ; CHECK-NEXT: vmov q2, q1
631 ; CHECK-NEXT: vmov q4, q1
632 ; CHECK-NEXT: dlstp.32 lr, r7
633 ; CHECK-NEXT: .LBB4_3: @ %vector.body
634 ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
635 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
636 ; CHECK-NEXT: add.w r9, r3, r5
637 ; CHECK-NEXT: vldrw.u32 q5, [r4], #16
638 ; CHECK-NEXT: vldrw.u32 q6, [r3], #16
639 ; CHECK-NEXT: add.w r12, r9, r5
640 ; CHECK-NEXT: vfma.f32 q3, q6, q5
641 ; CHECK-NEXT: vldrw.u32 q6, [r9]
642 ; CHECK-NEXT: add.w r6, r12, r5
643 ; CHECK-NEXT: vfma.f32 q4, q6, q5
644 ; CHECK-NEXT: vldrw.u32 q6, [r12]
645 ; CHECK-NEXT: adds r7, r6, r5
646 ; CHECK-NEXT: vfma.f32 q2, q6, q5
647 ; CHECK-NEXT: vldrw.u32 q6, [r6]
648 ; CHECK-NEXT: vfma.f32 q0, q6, q5
649 ; CHECK-NEXT: vldrw.u32 q6, [r7]
650 ; CHECK-NEXT: vfma.f32 q1, q6, q5
651 ; CHECK-NEXT: letp lr, .LBB4_3
652 ; CHECK-NEXT: @ %bb.4: @ %middle.block
653 ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1
654 ; CHECK-NEXT: vadd.f32 s18, s18, s19
655 ; CHECK-NEXT: add.w r1, r2, r11, lsl #2
656 ; CHECK-NEXT: vadd.f32 s16, s16, s17
657 ; CHECK-NEXT: vadd.f32 s14, s14, s15
658 ; CHECK-NEXT: vadd.f32 s12, s12, s13
659 ; CHECK-NEXT: vadd.f32 s6, s6, s7
660 ; CHECK-NEXT: vadd.f32 s4, s4, s5
661 ; CHECK-NEXT: vadd.f32 s10, s10, s11
662 ; CHECK-NEXT: vadd.f32 s8, s8, s9
663 ; CHECK-NEXT: vadd.f32 s0, s0, s1
664 ; CHECK-NEXT: vadd.f32 s1, s16, s18
665 ; CHECK-NEXT: vadd.f32 s2, s2, s3
666 ; CHECK-NEXT: vadd.f32 s12, s12, s14
667 ; CHECK-NEXT: vadd.f32 s4, s4, s6
668 ; CHECK-NEXT: vadd.f32 s6, s8, s10
669 ; CHECK-NEXT: vstr s1, [r1]
670 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
671 ; CHECK-NEXT: vadd.f32 s0, s0, s2
672 ; CHECK-NEXT: adds r0, #5
673 ; CHECK-NEXT: vstr s12, [r1]
674 ; CHECK-NEXT: add.w r1, r2, r10, lsl #2
675 ; CHECK-NEXT: vstr s6, [r1]
676 ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
677 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
678 ; CHECK-NEXT: vstr s0, [r1]
679 ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
680 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
681 ; CHECK-NEXT: vstr s4, [r1]
682 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
683 ; CHECK-NEXT: add r8, r1
684 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
685 ; CHECK-NEXT: cmp r0, r1
686 ; CHECK-NEXT: blo.w .LBB4_2
687 ; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
688 ; CHECK-NEXT: add sp, #32
689 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
690 ; CHECK-NEXT: add sp, #4
691 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
693 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
694 %i = load i32, ptr %NumInputs, align 4
695 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
696 %i1 = load i32, ptr %NumFilters, align 4
697 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
698 %i2 = load ptr, ptr %pDCTCoefs, align 4
699 %cmp = icmp ugt i32 %i, 1
700 tail call void @llvm.assume(i1 %cmp)
701 %sub = add i32 %i1, -5
702 %cmp3134 = icmp ugt i32 %sub, 1
703 br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup
705 for.body.preheader: ; preds = %entry
706 %n.rnd.up = add i32 %i, 3
707 %n.vec = and i32 %n.rnd.up, -4
710 for.cond.cleanup: ; preds = %middle.block, %entry
713 for.body: ; preds = %middle.block, %for.body.preheader
714 %k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ]
715 %mul4 = mul i32 %k2.0135, %i
716 %add = add nuw i32 %k2.0135, 1
717 %mul5 = mul i32 %add, %i
718 %add6 = add i32 %k2.0135, 2
719 %mul7 = mul i32 %add6, %i
720 %add8 = add i32 %k2.0135, 3
721 %mul9 = mul i32 %add8, %i
722 %add10 = add i32 %k2.0135, 4
723 %mul11 = mul i32 %add10, %i
724 br label %vector.body
726 vector.body: ; preds = %vector.body, %for.body
727 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
728 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i30, %vector.body ]
729 %vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i31, %vector.body ]
730 %vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i32, %vector.body ]
731 %vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i33, %vector.body ]
732 %vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i34, %vector.body ]
733 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
734 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
735 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
736 %i5 = add i32 %index, %mul4
737 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
738 %wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
739 %i8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load
740 %i9 = fadd fast <4 x float> %i8, %vec.phi137
741 %i10 = add i32 %index, %mul5
742 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
743 %wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
744 %i13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load
745 %i14 = fadd fast <4 x float> %i13, %vec.phi139
746 %i15 = add i32 %index, %mul7
747 %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
748 %wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
749 %i18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load
750 %i19 = fadd fast <4 x float> %i18, %vec.phi138
751 %i20 = add i32 %index, %mul9
752 %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
753 %wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
754 %i23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load
755 %i24 = fadd fast <4 x float> %i23, %vec.phi136
756 %i25 = add i32 %index, %mul11
757 %i26 = getelementptr inbounds float, ptr %i2, i32 %i25
758 %wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
759 %i28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load
760 %i29 = fadd fast <4 x float> %i28, %vec.phi
761 %i30 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi
762 %i31 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi136
763 %i32 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi137
764 %i33 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi138
765 %i34 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi139
766 %index.next = add i32 %index, 4
767 %i35 = icmp eq i32 %index.next, %n.vec
768 br i1 %i35, label %middle.block, label %vector.body
770 middle.block: ; preds = %vector.body
771 %i36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i34)
772 %i37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i33)
773 %i38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i32)
774 %i39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i31)
775 %i40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i30)
776 %arrayidx42 = getelementptr inbounds float, ptr %pOut, i32 %k2.0135
777 store float %i38, ptr %arrayidx42, align 4
778 %arrayidx44 = getelementptr inbounds float, ptr %pOut, i32 %add
779 store float %i36, ptr %arrayidx44, align 4
780 %arrayidx46 = getelementptr inbounds float, ptr %pOut, i32 %add6
781 store float %i37, ptr %arrayidx46, align 4
782 %arrayidx48 = getelementptr inbounds float, ptr %pOut, i32 %add8
783 store float %i39, ptr %arrayidx48, align 4
784 %arrayidx50 = getelementptr inbounds float, ptr %pOut, i32 %add10
785 store float %i40, ptr %arrayidx50, align 4
786 %add52 = add i32 %k2.0135, 5
787 %cmp3 = icmp ult i32 %add52, %sub
788 br i1 %cmp3, label %for.body, label %for.cond.cleanup
; DCT_mve6: 6x row-unrolled variant of the MVE DCT kernel.  Unpacks
; {pDCTCoefs, NumFilters, NumInputs} from %S; the outer for.body walks
; k2 = 1, 7, 13, ... while k2+6 < NumFilters-6, and each outer iteration
; computes six dot-products of the input row %pIn against coefficient rows
; k2 .. k2+5, storing the six scalar results to pOut[k2] .. pOut[k2+5].
; The inner loop is tail-folded via get.active.lane.mask, and the expected
; codegen still fits all six <4 x float> accumulators in q-registers, so the
; CHECK lines show a dlstp.32/letp tail-predicated low-overhead loop
; (contrast with DCT_mve7, where the accumulators no longer fit).
define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve6:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #6
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB5_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r3, r3, lsl #1
; CHECK-NEXT: lsls r1, r1, #3
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB5_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r11, r0, #2
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #1
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q5, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB5_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r12, r3, r5
; CHECK-NEXT: vldrw.u32 q6, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [r3], #16
; CHECK-NEXT: add.w r10, r12, r5
; CHECK-NEXT: vfma.f32 q4, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r12]
; CHECK-NEXT: add.w r6, r10, r5
; CHECK-NEXT: vfma.f32 q5, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r10]
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vfma.f32 q2, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r6]
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vfma.f32 q0, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r7]
; CHECK-NEXT: vfma.f32 q3, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r6]
; CHECK-NEXT: vfma.f32 q1, q7, q6
; CHECK-NEXT: letp lr, .LBB5_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1
; CHECK-NEXT: vadd.f32 s22, s22, s23
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s20, s20, s21
; CHECK-NEXT: vadd.f32 s18, s18, s19
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s1, s20, s22
; CHECK-NEXT: vadd.f32 s6, s6, s7
; CHECK-NEXT: vadd.f32 s3, s16, s18
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: vstr s3, [r1]
; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s6, s12, s14
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB5_2
; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; entry: unpack the instance struct; llvm.assume(NumInputs > 1) feeds the
; vectorizer, and %sub = NumFilters - 6 is the outer-loop trip bound.
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
%i = load i32, ptr %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
%i1 = load i32, ptr %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
%i2 = load ptr, ptr %pDCTCoefs, align 4
%cmp = icmp ugt i32 %i, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %i1, -6
%cmp3155 = icmp ugt i32 %sub, 1
br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup
; %n.vec = NumInputs rounded up to a multiple of 4 (the VF of the folded loop).
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %i, 3
%n.vec = and i32 %n.rnd.up, -4
for.cond.cleanup: ; preds = %middle.block, %entry
; Outer loop: k2 advances by 6; mul4..mul13 are the element offsets of
; coefficient rows k2 .. k2+5 within %i2.
for.body: ; preds = %middle.block, %for.body.preheader
%k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0156, %i
%add = add nuw i32 %k2.0156, 1
%mul5 = mul i32 %add, %i
%add6 = add i32 %k2.0156, 2
%mul7 = mul i32 %add6, %i
%add8 = add i32 %k2.0156, 3
%mul9 = mul i32 %add8, %i
%add10 = add i32 %k2.0156, 4
%mul11 = mul i32 %add10, %i
%add12 = add i32 %k2.0156, 5
%mul13 = mul i32 %add12, %i
br label %vector.body
; Inner loop: one tail-folded pass over the input row; a single masked load
; of pIn is shared by six multiply-accumulate chains, one per coefficient row.
; The phi/fadd pairing is deliberately shuffled (e.g. %i9 accumulates into
; %vec.phi158); each select keeps inactive lanes at the old accumulator value.
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i35, %vector.body ]
%vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i36, %vector.body ]
%vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i37, %vector.body ]
%vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i38, %vector.body ]
%vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i39, %vector.body ]
%vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i40, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
%i3 = getelementptr inbounds float, ptr %pIn, i32 %index
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i5 = add i32 %index, %mul4
%i6 = getelementptr inbounds float, ptr %i2, i32 %i5
%wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load
%i9 = fadd fast <4 x float> %i8, %vec.phi158
%i10 = add i32 %index, %mul5
%i11 = getelementptr inbounds float, ptr %i2, i32 %i10
%wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load
%i14 = fadd fast <4 x float> %i13, %vec.phi160
%i15 = add i32 %index, %mul7
%i16 = getelementptr inbounds float, ptr %i2, i32 %i15
%wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load
%i19 = fadd fast <4 x float> %i18, %vec.phi161
%i20 = add i32 %index, %mul9
%i21 = getelementptr inbounds float, ptr %i2, i32 %i20
%wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load
%i24 = fadd fast <4 x float> %i23, %vec.phi159
%i25 = add i32 %index, %mul11
%i26 = getelementptr inbounds float, ptr %i2, i32 %i25
%wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load
%i29 = fadd fast <4 x float> %i28, %vec.phi157
%i30 = add i32 %index, %mul13
%i31 = getelementptr inbounds float, ptr %i2, i32 %i30
%wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i31, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load
%i34 = fadd fast <4 x float> %i33, %vec.phi
%i35 = select <4 x i1> %active.lane.mask, <4 x float> %i34, <4 x float> %vec.phi
%i36 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi157
%i37 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi158
%i38 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi159
%i39 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi160
%i40 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi161
%index.next = add i32 %index, 4
%i41 = icmp eq i32 %index.next, %n.vec
br i1 %i41, label %middle.block, label %vector.body
; Reduce each accumulator to a scalar; each reduction is routed back to the
; pOut slot of the coefficient row it accumulated (note the index shuffle).
middle.block: ; preds = %vector.body
%i42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i40)
%i43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i39)
%i44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i38)
%i45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i37)
%i46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i36)
%i47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i35)
%arrayidx49 = getelementptr inbounds float, ptr %pOut, i32 %k2.0156
store float %i45, ptr %arrayidx49, align 4
%arrayidx51 = getelementptr inbounds float, ptr %pOut, i32 %add
store float %i43, ptr %arrayidx51, align 4
%arrayidx53 = getelementptr inbounds float, ptr %pOut, i32 %add6
store float %i42, ptr %arrayidx53, align 4
%arrayidx55 = getelementptr inbounds float, ptr %pOut, i32 %add8
store float %i44, ptr %arrayidx55, align 4
%arrayidx57 = getelementptr inbounds float, ptr %pOut, i32 %add10
store float %i46, ptr %arrayidx57, align 4
%arrayidx59 = getelementptr inbounds float, ptr %pOut, i32 %add12
store float %i47, ptr %arrayidx59, align 4
%add61 = add i32 %k2.0156, 6
%cmp3 = icmp ult i32 %add61, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
; DCT_mve7: 7x row-unrolled variant.  Same shape as DCT_mve6 but with seven
; accumulators (rows k2 .. k2+6, outer step 7).  The expected codegen no
; longer forms a dlstp/letp tail-predicated loop: the CHECK lines show an
; explicit vctp.32 predicate with plain dls/le, and two q-register
; accumulators are spilled/reloaded on the stack each inner iteration
; (the "16-byte Spill/Reload" lines below) -- presumably because seven live
; accumulators plus the shared input vector exceed the q-register budget;
; confirm against the tail-predication pass if regenerating these checks.
define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve7:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #72
; CHECK-NEXT: sub sp, #72
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #7
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB6_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r9, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: rsb r1, r3, r3, lsl #3
; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB6_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
; CHECK-NEXT: adds r1, r0, #6
; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: vmov.i32 q2, #0x0
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #2
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: add.w r8, r0, #1
; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vmov q5, q2
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov q6, q2
; CHECK-NEXT: vmov q1, q2
; CHECK-NEXT: mov r12, r7
; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB6_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: add.w r10, r3, r5
; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
; CHECK-NEXT: add.w r11, r10, r5
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vfmat.f32 q5, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r10]
; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vfmat.f32 q6, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r11]
; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: vmov q5, q4
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill
; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vfmat.f32 q3, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vmov q4, q5
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vfmat.f32 q4, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
; CHECK-NEXT: vfmat.f32 q2, q0, q7
; CHECK-NEXT: le lr, .LBB6_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s26, s27
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s2, s24, s25
; CHECK-NEXT: vadd.f32 s1, s22, s23
; CHECK-NEXT: vadd.f32 s3, s20, s21
; CHECK-NEXT: vadd.f32 s6, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s9, s18, s19
; CHECK-NEXT: vadd.f32 s11, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s2, s3, s1
; CHECK-NEXT: vadd.f32 s5, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: adds r0, #7
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s6, s7, s5
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: vadd.f32 s10, s11, s9
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s12, s12, s14
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r9, r1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB6_2
; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #72
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; entry: unpack the instance struct; %sub = NumFilters - 7 bounds the outer loop.
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
%i = load i32, ptr %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
%i1 = load i32, ptr %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
%i2 = load ptr, ptr %pDCTCoefs, align 4
%cmp = icmp ugt i32 %i, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %i1, -7
%cmp3176 = icmp ugt i32 %sub, 1
br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup
; %n.vec = NumInputs rounded up to a multiple of 4 (the VF of the folded loop).
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %i, 3
%n.vec = and i32 %n.rnd.up, -4
for.cond.cleanup: ; preds = %middle.block, %entry
; Outer loop: k2 advances by 7; mul4..mul15 are the element offsets of
; coefficient rows k2 .. k2+6 within %i2.
for.body: ; preds = %middle.block, %for.body.preheader
%k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0177, %i
%add = add nuw i32 %k2.0177, 1
%mul5 = mul i32 %add, %i
%add6 = add i32 %k2.0177, 2
%mul7 = mul i32 %add6, %i
%add8 = add i32 %k2.0177, 3
%mul9 = mul i32 %add8, %i
%add10 = add i32 %k2.0177, 4
%mul11 = mul i32 %add10, %i
%add12 = add i32 %k2.0177, 5
%mul13 = mul i32 %add12, %i
%add14 = add i32 %k2.0177, 6
%mul15 = mul i32 %add14, %i
br label %vector.body
; Inner loop: tail-folded via get.active.lane.mask; one masked load of pIn is
; shared by seven multiply-accumulate chains.  As in DCT_mve6, the phi/fadd
; pairing is shuffled and the selects mask off inactive lanes.
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i40, %vector.body ]
%vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i41, %vector.body ]
%vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i42, %vector.body ]
%vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i43, %vector.body ]
%vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i44, %vector.body ]
%vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i45, %vector.body ]
%vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i46, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
%i3 = getelementptr inbounds float, ptr %pIn, i32 %index
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i5 = add i32 %index, %mul4
%i6 = getelementptr inbounds float, ptr %i2, i32 %i5
%wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load
%i9 = fadd fast <4 x float> %i8, %vec.phi179
%i10 = add i32 %index, %mul5
%i11 = getelementptr inbounds float, ptr %i2, i32 %i10
%wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load
%i14 = fadd fast <4 x float> %i13, %vec.phi181
%i15 = add i32 %index, %mul7
%i16 = getelementptr inbounds float, ptr %i2, i32 %i15
%wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load
%i19 = fadd fast <4 x float> %i18, %vec.phi183
%i20 = add i32 %index, %mul9
%i21 = getelementptr inbounds float, ptr %i2, i32 %i20
%wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load
%i24 = fadd fast <4 x float> %i23, %vec.phi182
%i25 = add i32 %index, %mul11
%i26 = getelementptr inbounds float, ptr %i2, i32 %i25
%wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load
%i29 = fadd fast <4 x float> %i28, %vec.phi180
%i30 = add i32 %index, %mul13
%i31 = getelementptr inbounds float, ptr %i2, i32 %i30
%wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i31, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load
%i34 = fadd fast <4 x float> %i33, %vec.phi178
%i35 = add i32 %index, %mul15
%i36 = getelementptr inbounds float, ptr %i2, i32 %i35
%wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i36, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%i38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load
%i39 = fadd fast <4 x float> %i38, %vec.phi
%i40 = select <4 x i1> %active.lane.mask, <4 x float> %i39, <4 x float> %vec.phi
%i41 = select <4 x i1> %active.lane.mask, <4 x float> %i34, <4 x float> %vec.phi178
%i42 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi179
%i43 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi180
%i44 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi181
%i45 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi182
%i46 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi183
%index.next = add i32 %index, 4
%i47 = icmp eq i32 %index.next, %n.vec
br i1 %i47, label %middle.block, label %vector.body
; Reduce each of the seven accumulators to a scalar; each reduction goes to
; the pOut slot of the coefficient row it accumulated.
middle.block: ; preds = %vector.body
%i48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i46)
%i49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i45)
%i50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i44)
%i51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i43)
%i52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i42)
%i53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i41)
%i54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i40)
%arrayidx56 = getelementptr inbounds float, ptr %pOut, i32 %k2.0177
store float %i52, ptr %arrayidx56, align 4
%arrayidx58 = getelementptr inbounds float, ptr %pOut, i32 %add
store float %i50, ptr %arrayidx58, align 4
%arrayidx60 = getelementptr inbounds float, ptr %pOut, i32 %add6
store float %i48, ptr %arrayidx60, align 4
%arrayidx62 = getelementptr inbounds float, ptr %pOut, i32 %add8
store float %i49, ptr %arrayidx62, align 4
%arrayidx64 = getelementptr inbounds float, ptr %pOut, i32 %add10
store float %i51, ptr %arrayidx64, align 4
%arrayidx66 = getelementptr inbounds float, ptr %pOut, i32 %add12
store float %i53, ptr %arrayidx66, align 4
%arrayidx68 = getelementptr inbounds float, ptr %pOut, i32 %add14
store float %i54, ptr %arrayidx68, align 4
%add70 = add i32 %k2.0177, 7
%cmp3 = icmp ult i32 %add70, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
1306 define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
1307 ; CHECK-LABEL: DCT_mve8:
1308 ; CHECK: @ %bb.0: @ %entry
1309 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1310 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1311 ; CHECK-NEXT: .pad #4
1312 ; CHECK-NEXT: sub sp, #4
1313 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1314 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1315 ; CHECK-NEXT: .pad #88
1316 ; CHECK-NEXT: sub sp, #88
1317 ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
1318 ; CHECK-NEXT: ldr r1, [r0, #4]
1319 ; CHECK-NEXT: subs r1, #8
1320 ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
1321 ; CHECK-NEXT: cmp r1, #2
1322 ; CHECK-NEXT: blo.w .LBB7_5
1323 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1324 ; CHECK-NEXT: ldr r3, [r0, #8]
1325 ; CHECK-NEXT: ldr r1, [r0]
1326 ; CHECK-NEXT: adds r0, r3, #3
1327 ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
1328 ; CHECK-NEXT: bic r0, r0, #3
1329 ; CHECK-NEXT: add.w r12, r1, r3, lsl #2
1330 ; CHECK-NEXT: subs r1, r0, #4
1331 ; CHECK-NEXT: movs r0, #1
1332 ; CHECK-NEXT: lsls r6, r3, #2
1333 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2
1334 ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
1335 ; CHECK-NEXT: lsls r1, r3, #5
1336 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
1337 ; CHECK-NEXT: .LBB7_2: @ %for.body
1338 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
1339 ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
1340 ; CHECK-NEXT: adds r1, r0, #7
1341 ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
1342 ; CHECK-NEXT: adds r1, r0, #6
1343 ; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
1344 ; CHECK-NEXT: adds r1, r0, #5
1345 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
1346 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
1347 ; CHECK-NEXT: adds r1, r0, #4
1348 ; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
1349 ; CHECK-NEXT: vmov.i32 q3, #0x0
1350 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
1351 ; CHECK-NEXT: adds r4, r0, #3
1352 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
1353 ; CHECK-NEXT: add.w r8, r0, #2
1354 ; CHECK-NEXT: adds r1, r0, #1
1355 ; CHECK-NEXT: mov r3, r12
1356 ; CHECK-NEXT: vmov q5, q3
1357 ; CHECK-NEXT: vmov q6, q3
1358 ; CHECK-NEXT: vmov q4, q3
1359 ; CHECK-NEXT: vmov q7, q3
1360 ; CHECK-NEXT: vmov q2, q3
1361 ; CHECK-NEXT: mov r10, r7
1362 ; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill
1363 ; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill
1364 ; CHECK-NEXT: dls lr, r5
1365 ; CHECK-NEXT: .LBB7_3: @ %vector.body
1366 ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
1367 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
1368 ; CHECK-NEXT: vctp.32 r10
1369 ; CHECK-NEXT: add.w r11, r3, r6
1371 ; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
1372 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
1373 ; CHECK-NEXT: add.w r5, r11, r6
1374 ; CHECK-NEXT: sub.w r10, r10, #4
1376 ; CHECK-NEXT: vfmat.f32 q6, q1, q0
1377 ; CHECK-NEXT: vldrwt.u32 q1, [r11]
1378 ; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
1379 ; CHECK-NEXT: vmov q6, q5
1381 ; CHECK-NEXT: vfmat.f32 q7, q1, q0
1382 ; CHECK-NEXT: vmov q5, q3
1383 ; CHECK-NEXT: vmov q3, q4
1384 ; CHECK-NEXT: vmov q4, q2
1386 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
1387 ; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
1388 ; CHECK-NEXT: adds r7, r5, r6
1390 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
1391 ; CHECK-NEXT: vldrwt.u32 q1, [r7]
1392 ; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
1393 ; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
1394 ; CHECK-NEXT: adds r5, r7, r6
1396 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
1397 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
1398 ; CHECK-NEXT: adds r7, r5, r6
1399 ; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
1400 ; CHECK-NEXT: vmov q2, q4
1401 ; CHECK-NEXT: vmov q4, q3
1403 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
1404 ; CHECK-NEXT: vldrwt.u32 q1, [r7]
1405 ; CHECK-NEXT: adds r5, r7, r6
1406 ; CHECK-NEXT: vmov q3, q5
1408 ; CHECK-NEXT: vfmat.f32 q4, q1, q0
1409 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
1410 ; CHECK-NEXT: vmov q5, q6
1411 ; CHECK-NEXT: add r5, r6
1413 ; CHECK-NEXT: vfmat.f32 q5, q1, q0
1414 ; CHECK-NEXT: vldrwt.u32 q1, [r5]
1415 ; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
1417 ; CHECK-NEXT: vfmat.f32 q3, q1, q0
1418 ; CHECK-NEXT: le lr, .LBB7_3
1419 ; CHECK-NEXT: @ %bb.4: @ %middle.block
1420 ; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1
1421 ; CHECK-NEXT: vadd.f32 s0, s30, s31
1422 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
1423 ; CHECK-NEXT: vadd.f32 s2, s28, s29
1424 ; CHECK-NEXT: vadd.f32 s4, s26, s27
1425 ; CHECK-NEXT: vadd.f32 s6, s24, s25
1426 ; CHECK-NEXT: vadd.f32 s5, s18, s19
1427 ; CHECK-NEXT: vadd.f32 s7, s16, s17
1428 ; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
1429 ; CHECK-NEXT: vadd.f32 s10, s10, s11
1430 ; CHECK-NEXT: vadd.f32 s8, s8, s9
1431 ; CHECK-NEXT: vadd.f32 s9, s18, s19
1432 ; CHECK-NEXT: vadd.f32 s11, s16, s17
1433 ; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload
1434 ; CHECK-NEXT: vadd.f32 s14, s14, s15
1435 ; CHECK-NEXT: vadd.f32 s12, s12, s13
1436 ; CHECK-NEXT: vadd.f32 s13, s18, s19
1437 ; CHECK-NEXT: vadd.f32 s15, s16, s17
1438 ; CHECK-NEXT: vadd.f32 s0, s2, s0
1439 ; CHECK-NEXT: vadd.f32 s2, s6, s4
1440 ; CHECK-NEXT: vadd.f32 s8, s8, s10
1441 ; CHECK-NEXT: vadd.f32 s10, s11, s9
1442 ; CHECK-NEXT: vadd.f32 s6, s12, s14
1443 ; CHECK-NEXT: vadd.f32 s1, s22, s23
1444 ; CHECK-NEXT: vadd.f32 s14, s15, s13
1445 ; CHECK-NEXT: vstr s0, [r1]
1446 ; CHECK-NEXT: add.w r1, r2, r0, lsl #2
1447 ; CHECK-NEXT: vadd.f32 s3, s20, s21
1448 ; CHECK-NEXT: adds r0, #8
1449 ; CHECK-NEXT: vstr s2, [r1]
1450 ; CHECK-NEXT: add.w r1, r2, r8, lsl #2
1451 ; CHECK-NEXT: vadd.f32 s12, s7, s5
1452 ; CHECK-NEXT: vstr s10, [r1]
1453 ; CHECK-NEXT: add.w r1, r2, r4, lsl #2
1454 ; CHECK-NEXT: vstr s14, [r1]
1455 ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
1456 ; CHECK-NEXT: vadd.f32 s4, s3, s1
1457 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
1458 ; CHECK-NEXT: vstr s8, [r1]
1459 ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
1460 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
1461 ; CHECK-NEXT: vstr s12, [r1]
1462 ; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
1463 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
1464 ; CHECK-NEXT: vstr s4, [r1]
1465 ; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
1466 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
1467 ; CHECK-NEXT: vstr s6, [r1]
1468 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
1469 ; CHECK-NEXT: add r12, r1
1470 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
1471 ; CHECK-NEXT: cmp r0, r1
1472 ; CHECK-NEXT: blo.w .LBB7_2
1473 ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
1474 ; CHECK-NEXT: add sp, #88
1475 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1476 ; CHECK-NEXT: add sp, #4
1477 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
1479 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
1480 %i = load i32, ptr %NumInputs, align 4
1481 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
1482 %i1 = load i32, ptr %NumFilters, align 4
1483 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
1484 %i2 = load ptr, ptr %pDCTCoefs, align 4
1485 %cmp = icmp ugt i32 %i, 1
1486 tail call void @llvm.assume(i1 %cmp)
1487 %sub = add i32 %i1, -8
1488 %cmp3197 = icmp ugt i32 %sub, 1
1489 br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup
1491 for.body.preheader: ; preds = %entry
1492 %n.rnd.up = add i32 %i, 3
1493 %n.vec = and i32 %n.rnd.up, -4
1496 for.cond.cleanup: ; preds = %middle.block, %entry
1499 for.body: ; preds = %middle.block, %for.body.preheader
1500 %k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ]
1501 %mul4 = mul i32 %k2.0198, %i
1502 %add = add nuw nsw i32 %k2.0198, 1
1503 %mul5 = mul i32 %add, %i
1504 %add6 = add nuw nsw i32 %k2.0198, 2
1505 %mul7 = mul i32 %add6, %i
1506 %add8 = add nuw nsw i32 %k2.0198, 3
1507 %mul9 = mul i32 %add8, %i
1508 %add10 = add nuw nsw i32 %k2.0198, 4
1509 %mul11 = mul i32 %add10, %i
1510 %add12 = add nuw nsw i32 %k2.0198, 5
1511 %mul13 = mul i32 %add12, %i
1512 %add14 = add nuw nsw i32 %k2.0198, 6
1513 %mul15 = mul i32 %add14, %i
1514 %add16 = add i32 %k2.0198, 7
1515 %mul17 = mul i32 %add16, %i
1516 br label %vector.body
1518 vector.body: ; preds = %vector.body, %for.body
1519 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
1520 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i45, %vector.body ]
1521 %vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i46, %vector.body ]
1522 %vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i47, %vector.body ]
1523 %vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i48, %vector.body ]
1524 %vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i49, %vector.body ]
1525 %vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i50, %vector.body ]
1526 %vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i51, %vector.body ]
1527 %vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i52, %vector.body ]
1528 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
1529 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
1530 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
1531 %i5 = add i32 %index, %mul4
1532 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
1533 %wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
1534 %i8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load
1535 %i9 = fadd fast <4 x float> %i8, %vec.phi200
1536 %i10 = add i32 %index, %mul5
1537 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
1538 %wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
1539 %i13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load
1540 %i14 = fadd fast <4 x float> %i13, %vec.phi202
1541 %i15 = add i32 %index, %mul7
1542 %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
1543 %wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
1544 %i18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load
1545 %i19 = fadd fast <4 x float> %i18, %vec.phi204
1546 %i20 = add i32 %index, %mul9
1547 %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
1548 %wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
1549 %i23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load
1550 %i24 = fadd fast <4 x float> %i23, %vec.phi205
1551 %i25 = add i32 %index, %mul11
1552 %i26 = getelementptr inbounds float, ptr %i2, i32 %i25
1553 %wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
1554 %i28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load
1555 %i29 = fadd fast <4 x float> %i28, %vec.phi203
1556 %i30 = add i32 %index, %mul13
1557 %i31 = getelementptr inbounds float, ptr %i2, i32 %i30
1558 %wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i31, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
1559 %i33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load
1560 %i34 = fadd fast <4 x float> %i33, %vec.phi201
1561 %i35 = add i32 %index, %mul15
1562 %i36 = getelementptr inbounds float, ptr %i2, i32 %i35
1563 %wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i36, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
1564 %i38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load
1565 %i39 = fadd fast <4 x float> %i38, %vec.phi199
1566 %i40 = add i32 %index, %mul17
1567 %i41 = getelementptr inbounds float, ptr %i2, i32 %i40
1568 %wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i41, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
1569 %i43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load
1570 %i44 = fadd fast <4 x float> %i43, %vec.phi
1571 %i45 = select <4 x i1> %active.lane.mask, <4 x float> %i44, <4 x float> %vec.phi
1572 %i46 = select <4 x i1> %active.lane.mask, <4 x float> %i39, <4 x float> %vec.phi199
1573 %i47 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi200
1574 %i48 = select <4 x i1> %active.lane.mask, <4 x float> %i34, <4 x float> %vec.phi201
1575 %i49 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi202
1576 %i50 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi203
1577 %i51 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi204
1578 %i52 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi205
1579 %index.next = add i32 %index, 4
1580 %i53 = icmp eq i32 %index.next, %n.vec
1581 br i1 %i53, label %middle.block, label %vector.body
1583 middle.block: ; preds = %vector.body
1584 %i54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i52)
1585 %i55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i51)
1586 %i56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i50)
1587 %i57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i49)
1588 %i58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i48)
1589 %i59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i47)
1590 %i60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i46)
1591 %i61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i45)
1592 %arrayidx63 = getelementptr inbounds float, ptr %pOut, i32 %k2.0198
1593 store float %i59, ptr %arrayidx63, align 4
1594 %arrayidx65 = getelementptr inbounds float, ptr %pOut, i32 %add
1595 store float %i57, ptr %arrayidx65, align 4
1596 %arrayidx67 = getelementptr inbounds float, ptr %pOut, i32 %add6
1597 store float %i55, ptr %arrayidx67, align 4
1598 %arrayidx69 = getelementptr inbounds float, ptr %pOut, i32 %add8
1599 store float %i54, ptr %arrayidx69, align 4
1600 %arrayidx71 = getelementptr inbounds float, ptr %pOut, i32 %add10
1601 store float %i56, ptr %arrayidx71, align 4
1602 %arrayidx73 = getelementptr inbounds float, ptr %pOut, i32 %add12
1603 store float %i58, ptr %arrayidx73, align 4
1604 %arrayidx75 = getelementptr inbounds float, ptr %pOut, i32 %add14
1605 store float %i60, ptr %arrayidx75, align 4
1606 %arrayidx77 = getelementptr inbounds float, ptr %pOut, i32 %add16
1607 store float %i61, ptr %arrayidx77, align 4
1608 %add79 = add i32 %k2.0198, 8
1609 %cmp3 = icmp ult i32 %add79, %sub
1610 br i1 %cmp3, label %for.body, label %for.cond.cleanup
; Intrinsic declarations referenced by the tail-folded, vectorized DCT loops above.
declare void @llvm.assume(i1 noundef)
; Yields the lane predicate (base + lane < trip count) used for tail folding;
; this is what lowers to MVE VCTP/DLSTP-style predication in the CHECK output.
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
; Predicated 4 x f32 load; masked-off lanes take the passthru operand (undef here).
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
; Horizontal fadd reduction of a <4 x float>; callers above use the `fast` form,
; which permits the reassociating vadd.f32 pairs seen in middle.block.
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)