1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
4 ; Tail predicated so we use DLSTP
; @simple: a single do-while loop that accumulates a predicated MVE vector
; reduction (llvm.arm.mve.addv.predicated) over %x into a running sum that
; starts at %m, and stores the result to %z. The lane predicate comes from
; llvm.arm.mve.vctp32 applied to the remaining element count, which starts at
; %n and drops by 4 on every iteration.
; The assertions expect a tail-predicated low-overhead loop: dlstp.32 in the
; preheader, letp as the back-branch, and no explicit vctp in the loop body.
5 define void @simple(ptr nocapture readonly %x, ptr nocapture readnone %y, ptr nocapture %z, i32 %m, i32 %n) {
7 ; CHECK: @ %bb.0: @ %entry
8 ; CHECK-NEXT: .save {r7, lr}
9 ; CHECK-NEXT: push {r7, lr}
10 ; CHECK-NEXT: ldr r1, [sp, #8]
11 ; CHECK-NEXT: mov r12, r3
12 ; CHECK-NEXT: adds r3, r1, #3
13 ; CHECK-NEXT: lsrs r3, r3, #2
14 ; CHECK-NEXT: beq .LBB0_3
15 ; CHECK-NEXT: @ %bb.1: @ %do.body.preheader
16 ; CHECK-NEXT: dlstp.32 lr, r1
17 ; CHECK-NEXT: .LBB0_2: @ %do.body
18 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
19 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
20 ; CHECK-NEXT: vaddva.s32 r12, q0
21 ; CHECK-NEXT: letp lr, .LBB0_2
22 ; CHECK-NEXT: .LBB0_3: @ %if.end
23 ; CHECK-NEXT: str.w r12, [r2]
24 ; CHECK-NEXT: pop {r7, pc}
; %div is the iteration count; the loop is skipped entirely when it is zero.
; NOTE(review): %add is defined on a line not visible in this view (the
; expected 'adds r3, r1, #3' suggests %n + 3) - confirm against the full file.
27 %div = lshr i32 %add, 2
28 %cmp.not = icmp eq i32 %div, 0
29 br i1 %cmp.not, label %if.end, label %do.body
31 do.body: ; preds = %entry, %do.body
32 %n.addr.0 = phi i32 [ %sub, %do.body ], [ %n, %entry ]
33 %count.0 = phi i32 [ %sub3, %do.body ], [ %div, %entry ]
34 %s.0 = phi i32 [ %add2, %do.body ], [ %m, %entry ]
35 %x.addr.0 = phi ptr [ %add.ptr, %do.body ], [ %x, %entry ]
; Lane predicate derived from the remaining element count.
36 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0)
37 %1 = load <4 x i32>, ptr %x.addr.0, align 4
; Predicated reduction of the loaded vector, then added to the running sum.
38 %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %1, i32 0, <4 x i1> %0)
39 %add2 = add nsw i32 %2, %s.0
; Advance the pointer by 4 i32 elements and reduce both counters.
40 %add.ptr = getelementptr inbounds i32, ptr %x.addr.0, i32 4
41 %sub = add i32 %n.addr.0, -4
42 %sub3 = add nsw i32 %count.0, -1
; Take the back-edge while the iteration counter entering this pass was > 1.
43 %cmp4 = icmp sgt i32 %count.0, 1
44 br i1 %cmp4, label %do.body, label %if.end
46 if.end: ; preds = %do.body, %entry
; %s.1 is %m when the loop was skipped, otherwise the final accumulated sum.
47 %s.1 = phi i32 [ %m, %entry ], [ %add2, %do.body ]
48 store i32 %s.1, ptr %z, align 4
52 ; Tail predicated so we use DLSTP
; @nested: the predicated-reduction do-while loop (vctp32 + addv.predicated)
; used as an inner loop. The outer for loop runs with %a.022 counting from 0
; until it equals %m and stores each inner result to %z[%a.022]. The input
; pointer and the remaining element count are carried from one outer iteration
; to the next; each inner sum starts at %m.
; The assertions expect the inner loop to be tail predicated (dlstp.32 / letp)
; while the outer loop keeps an ordinary compare-and-branch.
53 define void @nested(ptr nocapture readonly %x, ptr nocapture readnone %y, ptr nocapture %z, i32 %m, i32 %n) {
54 ; CHECK-LABEL: nested:
55 ; CHECK: @ %bb.0: @ %entry
56 ; CHECK-NEXT: cmp r3, #0
59 ; CHECK-NEXT: .LBB1_1: @ %for.body.preheader
60 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
61 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
62 ; CHECK-NEXT: ldr.w r12, [sp, #24]
63 ; CHECK-NEXT: movs r1, #0
64 ; CHECK-NEXT: b .LBB1_4
65 ; CHECK-NEXT: .LBB1_2: @ in Loop: Header=BB1_4 Depth=1
66 ; CHECK-NEXT: mov r4, r3
67 ; CHECK-NEXT: .LBB1_3: @ %if.end
68 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1
69 ; CHECK-NEXT: str.w r4, [r2, r1, lsl #2]
70 ; CHECK-NEXT: adds r1, #1
71 ; CHECK-NEXT: cmp r1, r3
72 ; CHECK-NEXT: beq .LBB1_8
73 ; CHECK-NEXT: .LBB1_4: @ %for.body
74 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
75 ; CHECK-NEXT: @ Child Loop BB1_6 Depth 2
76 ; CHECK-NEXT: add.w r6, r12, #3
77 ; CHECK-NEXT: lsrs r7, r6, #2
78 ; CHECK-NEXT: beq .LBB1_2
79 ; CHECK-NEXT: @ %bb.5: @ %do.body.preheader
80 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1
81 ; CHECK-NEXT: bic r5, r6, #3
82 ; CHECK-NEXT: mov r4, r3
83 ; CHECK-NEXT: add.w r8, r0, r5, lsl #2
84 ; CHECK-NEXT: dlstp.32 lr, r12
85 ; CHECK-NEXT: .LBB1_6: @ %do.body
86 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1
87 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
88 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
89 ; CHECK-NEXT: vaddva.s32 r4, q0
90 ; CHECK-NEXT: letp lr, .LBB1_6
91 ; CHECK-NEXT: @ %bb.7: @ %if.end.loopexit
92 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1
93 ; CHECK-NEXT: sub.w r12, r12, r5
94 ; CHECK-NEXT: mov r0, r8
95 ; CHECK-NEXT: b .LBB1_3
96 ; CHECK-NEXT: .LBB1_8:
97 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, lr}
; Skip the outer loop entirely when %m is zero.
100 %cmp20.not = icmp eq i32 %m, 0
101 br i1 %cmp20.not, label %for.cond.cleanup, label %for.body
103 for.cond.cleanup: ; preds = %if.end, %entry
106 for.body: ; preds = %entry, %if.end
107 %x.addr.023 = phi ptr [ %x.addr.2, %if.end ], [ %x, %entry ]
108 %a.022 = phi i32 [ %inc, %if.end ], [ 0, %entry ]
109 %n.addr.021 = phi i32 [ %n.addr.2, %if.end ], [ %n, %entry ]
; Inner iteration count: (remaining count + 3) >> 2; zero skips the inner loop.
110 %add = add i32 %n.addr.021, 3
111 %div = lshr i32 %add, 2
112 %cmp1.not = icmp eq i32 %div, 0
113 br i1 %cmp1.not, label %if.end, label %do.body.preheader
115 do.body.preheader: ; preds = %for.body
; %0 is (remaining count + 3) with the low two bits cleared; %scevgep is the
; input pointer advanced by that many i32 elements (used after the inner loop).
116 %0 = and i32 %add, -4
117 %scevgep = getelementptr i32, ptr %x.addr.023, i32 %0
120 do.body: ; preds = %do.body.preheader, %do.body
121 %n.addr.1 = phi i32 [ %sub, %do.body ], [ %n.addr.021, %do.body.preheader ]
122 %count.0 = phi i32 [ %sub4, %do.body ], [ %div, %do.body.preheader ]
123 %s.0 = phi i32 [ %add3, %do.body ], [ %m, %do.body.preheader ]
124 %x.addr.1 = phi ptr [ %add.ptr, %do.body ], [ %x.addr.023, %do.body.preheader ]
; Lane predicate derived from the remaining element count of the inner loop.
125 %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.1)
126 %2 = load <4 x i32>, ptr %x.addr.1, align 4
; Predicated reduction of the loaded vector, then added to the running sum.
127 %3 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %2, i32 0, <4 x i1> %1)
128 %add3 = add nsw i32 %3, %s.0
129 %add.ptr = getelementptr inbounds i32, ptr %x.addr.1, i32 4
130 %sub = add i32 %n.addr.1, -4
131 %sub4 = add nsw i32 %count.0, -1
; Take the back-edge while the iteration counter entering this pass was > 1.
132 %cmp5 = icmp sgt i32 %count.0, 1
133 br i1 %cmp5, label %do.body, label %if.end.loopexit
135 if.end.loopexit: ; preds = %do.body
; Element count handed to the next outer iteration: current count minus %0.
136 %4 = sub i32 %n.addr.021, %0
139 if.end: ; preds = %if.end.loopexit, %for.body
; Values for the next outer iteration: unchanged count/pointer and sum %m if
; the inner loop was skipped, otherwise the post-loop count, sum and pointer.
140 %n.addr.2 = phi i32 [ %n.addr.021, %for.body ], [ %4, %if.end.loopexit ]
141 %s.1 = phi i32 [ %m, %for.body ], [ %add3, %if.end.loopexit ]
142 %x.addr.2 = phi ptr [ %x.addr.023, %for.body ], [ %scevgep, %if.end.loopexit ]
143 %arrayidx = getelementptr inbounds i32, ptr %z, i32 %a.022
144 store i32 %s.1, ptr %arrayidx, align 4
; Outer loop counter: leave once %inc reaches %m.
145 %inc = add nuw nsw i32 %a.022, 1
146 %exitcond.not = icmp eq i32 %inc, %m
147 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; MVE intrinsics called by @simple and @nested: vctp32 produces a <4 x i1>
; predicate from an i32, and addv.predicated reduces a <4 x i32> under a
; <4 x i1> predicate to an i32.
; NOTE(review): both callers pass 0 for the i32 operand of addv.predicated;
; presumably this is the signed/unsigned selector - confirm.
150 declare <4 x i1> @llvm.arm.mve.vctp32(i32)
151 declare i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
154 ; Long test that was spilling lr between t2LoopDec and t2LoopEnd
; @b: while loop that counts %d down to zero around a long chain of 32-bit and
; 64-bit arithmetic with several loads and stores, built with
; "frame-pointer"="all".
; The assertions expect the loop to be formed with wls / le, and the spills and
; reloads that appear in the loop body to involve only r1, r2 and r5 on stack
; slots - lr itself is not spilled inside the loop.
155 define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" {
157 ; CHECK: @ %bb.0: @ %entry
158 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
159 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
160 ; CHECK-NEXT: .setfp r7, sp, #12
161 ; CHECK-NEXT: add r7, sp, #12
162 ; CHECK-NEXT: .save {r8, r9, r10, r11}
163 ; CHECK-NEXT: push.w {r8, r9, r10, r11}
164 ; CHECK-NEXT: .pad #16
165 ; CHECK-NEXT: sub sp, #16
166 ; CHECK-NEXT: wls lr, r1, .LBB2_3
167 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
168 ; CHECK-NEXT: adds r6, r3, #4
169 ; CHECK-NEXT: adds r1, r0, #4
170 ; CHECK-NEXT: mvn r8, #1
171 ; CHECK-NEXT: @ implicit-def: $r9
172 ; CHECK-NEXT: @ implicit-def: $r4
173 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
174 ; CHECK-NEXT: .LBB2_2: @ %while.body
175 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
176 ; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
177 ; CHECK-NEXT: asrs r2, r4, #31
178 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
179 ; CHECK-NEXT: ldr r1, [r1]
180 ; CHECK-NEXT: muls r1, r3, r1
181 ; CHECK-NEXT: adds r4, r4, r1
182 ; CHECK-NEXT: adc.w r1, r2, r1, asr #31
183 ; CHECK-NEXT: adds.w r2, r4, #-2147483648
184 ; CHECK-NEXT: ldrd r2, r4, [r8]
185 ; CHECK-NEXT: adc r5, r1, #0
186 ; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill
187 ; CHECK-NEXT: smull r4, r2, r4, r9
188 ; CHECK-NEXT: asrs r1, r5, #31
189 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
190 ; CHECK-NEXT: subs r4, r5, r4
191 ; CHECK-NEXT: sbcs r1, r2
192 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
193 ; CHECK-NEXT: adds.w r10, r4, #-2147483648
194 ; CHECK-NEXT: adc r1, r1, #0
195 ; CHECK-NEXT: ldr r4, [r2, #-4]
196 ; CHECK-NEXT: muls r4, r3, r4
197 ; CHECK-NEXT: adds r3, #4
198 ; CHECK-NEXT: adds.w r12, r4, #-2147483648
199 ; CHECK-NEXT: asr.w r5, r4, #31
200 ; CHECK-NEXT: ldr r4, [r6]
201 ; CHECK-NEXT: adc r5, r5, #0
202 ; CHECK-NEXT: mul r2, r4, r0
203 ; CHECK-NEXT: adds r0, #4
204 ; CHECK-NEXT: add.w r2, r2, #-2147483648
205 ; CHECK-NEXT: asrl r12, r5, r2
206 ; CHECK-NEXT: smull r2, r5, r4, r12
207 ; CHECK-NEXT: lsll r2, r5, #30
208 ; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
209 ; CHECK-NEXT: asr.w r11, r5, #31
210 ; CHECK-NEXT: mov r12, r5
211 ; CHECK-NEXT: lsll r12, r11, r4
212 ; CHECK-NEXT: mul r2, r2, r9
213 ; CHECK-NEXT: lsrl r12, r11, #2
214 ; CHECK-NEXT: adds r2, #2
215 ; CHECK-NEXT: lsll r12, r11, r2
216 ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
217 ; CHECK-NEXT: add.w r5, r12, #-2147483648
218 ; CHECK-NEXT: asrl r10, r1, r5
219 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
220 ; CHECK-NEXT: lsrl r10, r1, #2
221 ; CHECK-NEXT: movs r1, #2
222 ; CHECK-NEXT: mov r9, r10
223 ; CHECK-NEXT: str.w r10, [r1]
224 ; CHECK-NEXT: ldr r1, [r8], #-4
225 ; CHECK-NEXT: mls r5, r1, r4, r5
226 ; CHECK-NEXT: adds.w r4, r5, #-2147483648
227 ; CHECK-NEXT: asr.w r1, r5, #31
228 ; CHECK-NEXT: adc r1, r1, #0
229 ; CHECK-NEXT: lsrl r4, r1, #2
230 ; CHECK-NEXT: rsbs r1, r4, #0
231 ; CHECK-NEXT: str r1, [r2]
232 ; CHECK-NEXT: str r1, [r6, #-4]
233 ; CHECK-NEXT: adds r6, #4
234 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
235 ; CHECK-NEXT: adds r1, #4
236 ; CHECK-NEXT: le lr, .LBB2_2
237 ; CHECK-NEXT: .LBB2_3: @ %while.end
238 ; CHECK-NEXT: add sp, #16
239 ; CHECK-NEXT: pop.w {r8, r9, r10, r11}
240 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; %e is reinterpreted as a pointer; it is the target of the first %sub42 store
; at the end of the loop body. The loop is skipped when %d is zero.
242 %0 = inttoptr i32 %e to ptr
243 %tobool.not70 = icmp eq i32 %d, 0
244 br i1 %tobool.not70, label %while.end, label %while.body
246 while.body: ; preds = %entry, %while.body
; Loop-carried state: %p.077 starts at the constant address 2 and steps back
; one i32 per iteration; %c.addr.076 and %n.075 step forward one i32 per
; iteration; %m.074 and %h.072 are undef on entry and carry %conv35 / %conv41
; from the previous iteration; %d.addr.073 is the down-counter.
247 %p.077 = phi ptr [ %incdec.ptr22, %while.body ], [ inttoptr (i32 2 to ptr), %entry ]
248 %c.addr.076 = phi ptr [ %incdec.ptr1, %while.body ], [ %c, %entry ]
249 %n.075 = phi ptr [ %incdec.ptr43, %while.body ], [ %n, %entry ]
250 %m.074 = phi i32 [ %conv35, %while.body ], [ undef, %entry ]
251 %d.addr.073 = phi i32 [ %dec, %while.body ], [ %d, %entry ]
252 %h.072 = phi i32 [ %conv41, %while.body ], [ undef, %entry ]
253 %incdec.ptr43 = getelementptr inbounds i32, ptr %n.075, i32 1
; The pointer values themselves (%1 here, %4 below) are used as multiplicands.
254 %1 = ptrtoint ptr %n.075 to i32
255 %2 = load i32, ptr %incdec.ptr43, align 4
256 %3 = load i32, ptr %c.addr.076, align 4
257 %mul = mul nsw i32 %3, %1
258 %conv = sext i32 %mul to i64
259 %add = add nsw i64 %conv, 2147483648
260 %incdec.ptr1 = getelementptr inbounds i32, ptr %c.addr.076, i32 1
261 %4 = ptrtoint ptr %c.addr.076 to i32
262 %mul2 = mul nsw i32 %2, %4
263 %conv3 = sext i32 %mul2 to i64
264 %add4 = add nsw i64 %conv3, 2147483648
265 %shr = ashr i64 %add, %add4
266 %5 = shl nuw i64 %shr, 32
267 %conv6 = ashr exact i64 %5, 32
268 %conv7 = sext i32 %2 to i64
269 %conv11 = sext i32 %h.072 to i64
270 %6 = load i32, ptr %incdec.ptr1, align 4
271 %mul12 = mul nsw i32 %6, %1
272 %conv13 = sext i32 %mul12 to i64
273 %add14 = add nuw nsw i64 %conv11, 2147483648
274 %add15 = add nsw i64 %add14, %conv13
275 %shr16 = ashr i64 %add15, 32
276 %conv17 = trunc i64 %shr16 to i32
277 %mul8 = shl nsw i64 %conv7, 30
278 %7 = mul i64 %mul8, %conv6
279 %conv18 = ashr i64 %7, 32
280 %sh_prom = zext i32 %2 to i64
281 %shl = shl i64 %conv18, %sh_prom
282 %conv21 = sext i32 %conv17 to i64
283 %incdec.ptr22 = getelementptr inbounds i32, ptr %p.077, i32 -1
284 %8 = load i32, ptr %p.077, align 4
285 %conv23 = sext i32 %8 to i64
286 %conv24 = sext i32 %m.074 to i64
287 %mul25 = mul nsw i64 %conv23, %conv24
288 %sub = sub nsw i64 2147483648, %mul25
289 %add26 = add nsw i64 %sub, %conv21
290 %9 = shl i64 %shl, 30
291 %conv27 = ashr i64 %9, 32
292 %10 = load i32, ptr %incdec.ptr22, align 4
293 %mul28 = mul nsw i32 %10, %m.074
294 %add29 = add nsw i32 %mul28, 2
295 %sh_prom30 = zext i32 %add29 to i64
296 %shl31 = shl i64 %conv27, %sh_prom30
297 %add32 = add nsw i64 %shl31, 2147483648
298 %shr33 = ashr i64 %add26, %add32
299 %11 = lshr i64 %shr33, 2
300 %conv35 = trunc i64 %11 to i32
; %conv35 is written to the constant address 2 and also becomes next %m.074.
301 store i32 %conv35, ptr inttoptr (i32 2 to ptr), align 4
; %incdec.ptr22 is loaded again here, after the store above.
302 %12 = load i32, ptr %incdec.ptr22, align 4
303 %mul36 = mul nsw i32 %12, %2
304 %sub37 = sub nsw i32 %conv17, %mul36
305 %conv38 = sext i32 %sub37 to i64
306 %add39 = add nsw i64 %conv38, 2147483648
307 %13 = lshr i64 %add39, 2
308 %conv41 = trunc i64 %13 to i32
309 %sub42 = sub nsw i32 0, %conv41
; The negated value is stored both through %0 (from %e) and through %n.075.
310 store i32 %sub42, ptr %0, align 4
311 store i32 %sub42, ptr %n.075, align 4
; Down-counter: decrement and leave the loop when it reaches zero.
312 %dec = add nsw i32 %d.addr.073, -1
313 %tobool.not = icmp eq i32 %dec, 0
314 br i1 %tobool.not, label %while.end, label %while.body
316 while.end: ; preds = %while.body, %entry
; External function; the assertions for @callinpreheader expect a 'bl callee'
; in the loop preheader block.
320 declare void @callee()
; @callinpreheader: sums %size i32 values read from %pAngle and stores the
; total to %pDst (0 is stored when %size is 0).
; The assertions expect the call in %for.body.ph to be emitted before the
; hardware loop is started: the trip count is copied to r6 ahead of the bl,
; and 'dls lr, r6' only follows the call, so the loop counter in lr is not
; live across it.
; NOTE(review): the IR call to @callee and the for.body.ph / for.body /
; for.cond.cleanup labels are on lines not visible in this view - confirm
; against the full file.
321 define void @callinpreheader(ptr noalias nocapture readonly %pAngle, ptr nocapture %pDst, i32 %size) {
322 ; CHECK-LABEL: callinpreheader:
323 ; CHECK: @ %bb.0: @ %entry
324 ; CHECK-NEXT: .save {r4, r5, r6, lr}
325 ; CHECK-NEXT: push {r4, r5, r6, lr}
326 ; CHECK-NEXT: mov r5, r0
327 ; CHECK-NEXT: mov r4, r1
328 ; CHECK-NEXT: movs r0, #0
329 ; CHECK-NEXT: cbz r2, .LBB3_3
330 ; CHECK-NEXT: @ %bb.1: @ %for.body.ph
331 ; CHECK-NEXT: mov r6, r2
332 ; CHECK-NEXT: bl callee
333 ; CHECK-NEXT: dls lr, r6
334 ; CHECK-NEXT: movs r0, #0
335 ; CHECK-NEXT: .LBB3_2: @ %for.body
336 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
337 ; CHECK-NEXT: ldr r1, [r5], #4
338 ; CHECK-NEXT: add r0, r1
339 ; CHECK-NEXT: le lr, .LBB3_2
340 ; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup
341 ; CHECK-NEXT: str r0, [r4]
342 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Skip the preheader and the loop when %size is zero.
344 %cmp7.not = icmp eq i32 %size, 0
345 br i1 %cmp7.not, label %for.cond.cleanup, label %for.body.ph
; Loop body: %i.09 is the element index and %s.08 the running sum, both
; starting at 0 when entered from %for.body.ph.
352 %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.ph ]
353 %s.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.ph ]
354 %arrayidx = getelementptr inbounds i32, ptr %pAngle, i32 %i.09
355 %0 = load i32, ptr %arrayidx, align 4
356 %add = add nsw i32 %0, %s.08
357 %inc = add nuw nsw i32 %i.09, 1
358 %exitcond.not = icmp eq i32 %inc, %size
359 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; Result is 0 when coming straight from %entry, otherwise the final sum.
362 %s.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
363 store i32 %s.0.lcssa, ptr %pDst, align 4