1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s
; mul_reduce_add: masked multiply-accumulate reduction over two i32 arrays.
; Each iteration loads four lanes from %a and %b under the lane mask built from
; (%index, %N), multiplies them and adds the product to the vector accumulator.
; After the loop the last mask selects between the new and previous accumulator
; and llvm.vector.reduce.add folds the four lanes into the i32 result; the
; result phi yields 0 when %N == 0.
; The expected assembly keeps an explicit vctp.32 inside a dls/le loop and
; applies vpsel after the loop (presumably because the mask is still needed by
; the select in middle.block -- TODO confirm).
; NOTE(review): as shown here the IR has no `entry:` label, no terminator in
; vector.ph or for.cond.cleanup, and no closing brace; lines appear to be
; missing -- confirm against the upstream test before regenerating assertions.
4 define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
5 ; CHECK-LABEL: mul_reduce_add:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: cmp r2, #0
9 ; CHECK-NEXT: moveq r0, #0
11 ; CHECK-NEXT: .LBB0_1: @ %vector.ph
12 ; CHECK-NEXT: push {r7, lr}
13 ; CHECK-NEXT: adds r3, r2, #3
14 ; CHECK-NEXT: vmov.i32 q1, #0x0
15 ; CHECK-NEXT: bic r3, r3, #3
16 ; CHECK-NEXT: sub.w r12, r3, #4
17 ; CHECK-NEXT: movs r3, #1
18 ; CHECK-NEXT: add.w r3, r3, r12, lsr #2
19 ; CHECK-NEXT: dls lr, r3
20 ; CHECK-NEXT: .LBB0_2: @ %vector.body
21 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
22 ; CHECK-NEXT: vctp.32 r2
23 ; CHECK-NEXT: vmov q0, q1
25 ; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
26 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
27 ; CHECK-NEXT: subs r2, #4
28 ; CHECK-NEXT: vmul.i32 q1, q2, q1
29 ; CHECK-NEXT: vadd.i32 q1, q1, q0
30 ; CHECK-NEXT: le lr, .LBB0_2
31 ; CHECK-NEXT: @ %bb.3: @ %middle.block
32 ; CHECK-NEXT: vpsel q0, q1, q0
33 ; CHECK-NEXT: vaddv.u32 r0, q0
34 ; CHECK-NEXT: pop {r7, pc}
; Skip the vector loop entirely when %N is zero.
36 %cmp8 = icmp eq i32 %N, 0
37 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
39 vector.ph: ; preds = %entry
; %n.vec = %N rounded up to a multiple of 4; it is the loop's exit bound.
40 %n.rnd.up = add i32 %N, 3
41 %n.vec = and i32 %n.rnd.up, -4
44 vector.body: ; preds = %vector.body, %vector.ph
45 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; %vec.phi is the running four-lane accumulator, starting at zero.
46 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
47 %0 = getelementptr inbounds i32, i32* %a, i32 %index
; %1 is the per-iteration lane mask shared by both masked loads and by the
; select in middle.block.
48 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
49 %2 = bitcast i32* %0 to <4 x i32>*
50 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
51 %3 = getelementptr inbounds i32, i32* %b, i32 %index
52 %4 = bitcast i32* %3 to <4 x i32>*
53 %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
54 %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
55 %6 = add nsw <4 x i32> %5, %vec.phi
56 %index.next = add i32 %index, 4
57 %7 = icmp eq i32 %index.next, %n.vec
58 br i1 %7, label %middle.block, label %vector.body
60 middle.block: ; preds = %vector.body
; Take the updated sum where the last mask is set and the previous
; accumulator elsewhere, then reduce the four lanes to a scalar.
61 %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
62 %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
63 br label %for.cond.cleanup
65 for.cond.cleanup: ; preds = %middle.block, %entry
66 %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
; mul_reduce_add_const: masked add reduction over the i32 array %a.
; Each iteration loads four lanes from %a under the lane mask built from
; (%index, %N) and adds them to the vector accumulator; after the loop the last
; mask selects between the new and previous accumulator and
; llvm.vector.reduce.add produces the i32 result (0 when %N == 0).
; The expected assembly keeps an explicit vctp.32 inside a dls/le loop.
; NOTE(review): despite the name, the visible loop body contains no multiply
; and %b is never used -- confirm this is intentional.
; NOTE(review): as shown here the IR has no `entry:` label, no `ret`, and no
; closing brace; lines appear to be missing -- confirm against upstream.
70 define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
71 ; CHECK-LABEL: mul_reduce_add_const:
72 ; CHECK: @ %bb.0: @ %entry
73 ; CHECK-NEXT: cmp r2, #0
75 ; CHECK-NEXT: moveq r0, #0
77 ; CHECK-NEXT: .LBB1_1: @ %vector.ph
78 ; CHECK-NEXT: push {r7, lr}
79 ; CHECK-NEXT: adds r1, r2, #3
80 ; CHECK-NEXT: movs r3, #1
81 ; CHECK-NEXT: bic r1, r1, #3
82 ; CHECK-NEXT: vmov.i32 q0, #0x0
83 ; CHECK-NEXT: subs r1, #4
84 ; CHECK-NEXT: add.w r1, r3, r1, lsr #2
85 ; CHECK-NEXT: dls lr, r1
86 ; CHECK-NEXT: .LBB1_2: @ %vector.body
87 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
88 ; CHECK-NEXT: vctp.32 r2
89 ; CHECK-NEXT: vmov q1, q0
91 ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
92 ; CHECK-NEXT: subs r2, #4
93 ; CHECK-NEXT: vadd.i32 q0, q0, q1
94 ; CHECK-NEXT: le lr, .LBB1_2
95 ; CHECK-NEXT: @ %bb.3: @ %middle.block
96 ; CHECK-NEXT: vpsel q0, q0, q1
97 ; CHECK-NEXT: vaddv.u32 r0, q0
98 ; CHECK-NEXT: pop {r7, pc}
; Skip the vector loop entirely when %N is zero.
100 %cmp6 = icmp eq i32 %N, 0
101 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
103 vector.ph: ; preds = %entry
; %n.vec = %N rounded up to a multiple of 4; it is the loop's exit bound.
104 %n.rnd.up = add i32 %N, 3
105 %n.vec = and i32 %n.rnd.up, -4
106 br label %vector.body
108 vector.body: ; preds = %vector.body, %vector.ph
109 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; %vec.phi is the running four-lane accumulator, starting at zero.
110 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
111 %0 = getelementptr inbounds i32, i32* %a, i32 %index
; %1 masks the load and is reused by the select in middle.block.
112 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
113 %2 = bitcast i32* %0 to <4 x i32>*
114 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
115 %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
116 %index.next = add i32 %index, 4
117 %4 = icmp eq i32 %index.next, %n.vec
118 br i1 %4, label %middle.block, label %vector.body
120 middle.block: ; preds = %vector.body
; Take the updated sum where the last mask is set and the previous
; accumulator elsewhere, then reduce the four lanes to a scalar.
121 %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
122 %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
123 br label %for.cond.cleanup
125 for.cond.cleanup: ; preds = %middle.block, %entry
126 %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
; add_reduce_add_const: masked add reduction over the i32 array %a.
; Each iteration loads four lanes from %a under the lane mask built from
; (%index, %N) and adds them to the vector accumulator; after the loop the last
; mask selects between the new and previous accumulator and
; llvm.vector.reduce.add produces the i32 result (0 when %N == 0).
; The expected assembly keeps an explicit vctp.32 inside a dls/le loop.
; NOTE(review): %b is never used in the visible body, so the constant operand
; the name suggests does not take part in the computation -- confirm this is
; intentional.
; NOTE(review): as shown here the IR has no `entry:` label, no `ret`, and no
; closing brace; lines appear to be missing -- confirm against upstream.
130 define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
131 ; CHECK-LABEL: add_reduce_add_const:
132 ; CHECK: @ %bb.0: @ %entry
133 ; CHECK-NEXT: cmp r2, #0
135 ; CHECK-NEXT: moveq r0, #0
136 ; CHECK-NEXT: bxeq lr
137 ; CHECK-NEXT: .LBB2_1: @ %vector.ph
138 ; CHECK-NEXT: push {r7, lr}
139 ; CHECK-NEXT: adds r1, r2, #3
140 ; CHECK-NEXT: movs r3, #1
141 ; CHECK-NEXT: bic r1, r1, #3
142 ; CHECK-NEXT: vmov.i32 q0, #0x0
143 ; CHECK-NEXT: subs r1, #4
144 ; CHECK-NEXT: add.w r1, r3, r1, lsr #2
145 ; CHECK-NEXT: dls lr, r1
146 ; CHECK-NEXT: .LBB2_2: @ %vector.body
147 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
148 ; CHECK-NEXT: vctp.32 r2
149 ; CHECK-NEXT: vmov q1, q0
151 ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
152 ; CHECK-NEXT: subs r2, #4
153 ; CHECK-NEXT: vadd.i32 q0, q0, q1
154 ; CHECK-NEXT: le lr, .LBB2_2
155 ; CHECK-NEXT: @ %bb.3: @ %middle.block
156 ; CHECK-NEXT: vpsel q0, q0, q1
157 ; CHECK-NEXT: vaddv.u32 r0, q0
158 ; CHECK-NEXT: pop {r7, pc}
; Skip the vector loop entirely when %N is zero.
160 %cmp6 = icmp eq i32 %N, 0
161 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
163 vector.ph: ; preds = %entry
; %n.vec = %N rounded up to a multiple of 4; it is the loop's exit bound.
164 %n.rnd.up = add i32 %N, 3
165 %n.vec = and i32 %n.rnd.up, -4
166 br label %vector.body
168 vector.body: ; preds = %vector.body, %vector.ph
169 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; %vec.phi is the running four-lane accumulator, starting at zero.
170 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
171 %0 = getelementptr inbounds i32, i32* %a, i32 %index
; %1 masks the load and is reused by the select in middle.block.
172 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
173 %2 = bitcast i32* %0 to <4 x i32>*
174 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
175 %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
176 %index.next = add i32 %index, 4
177 %4 = icmp eq i32 %index.next, %n.vec
178 br i1 %4, label %middle.block, label %vector.body
180 middle.block: ; preds = %vector.body
; Take the updated sum where the last mask is set and the previous
; accumulator elsewhere, then reduce the four lanes to a scalar.
181 %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
182 %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
183 br label %for.cond.cleanup
185 for.cond.cleanup: ; preds = %middle.block, %entry
186 %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
; vector_mul_const: a[i] = b[i] * c over i32 elements, four lanes at a time.
; %c is splatted into a <4 x i32>; each iteration does a masked load from %b,
; multiplies by the splat and does a masked store to %a, with both memory
; operations using the lane mask built from (%index, %N).
; The expected assembly uses a dlstp.32/letp loop whose body has plain
; vldrw/vstrw and a vector-by-register vmul.i32 taking r2 directly.
; NOTE(review): as shown here the IR has no `entry:` label, no `ret`, and no
; closing brace; lines appear to be missing -- confirm against upstream.
190 define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
191 ; CHECK-LABEL: vector_mul_const:
192 ; CHECK: @ %bb.0: @ %entry
193 ; CHECK-NEXT: push {r7, lr}
194 ; CHECK-NEXT: cmp r3, #0
196 ; CHECK-NEXT: popeq {r7, pc}
197 ; CHECK-NEXT: .LBB3_1: @ %vector.ph
198 ; CHECK-NEXT: dlstp.32 lr, r3
199 ; CHECK-NEXT: .LBB3_2: @ %vector.body
200 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
201 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
202 ; CHECK-NEXT: vmul.i32 q0, q0, r2
203 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
204 ; CHECK-NEXT: letp lr, .LBB3_2
205 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
206 ; CHECK-NEXT: pop {r7, pc}
; Skip the vector loop entirely when %N is zero.
208 %cmp6 = icmp eq i32 %N, 0
209 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
211 vector.ph: ; preds = %entry
; %n.vec = %N rounded up to a multiple of 4; it is the loop's exit bound.
212 %n.rnd.up = add i32 %N, 3
213 %n.vec = and i32 %n.rnd.up, -4
; Broadcast %c to all four lanes (insert into lane 0, shuffle with an
; all-zero index mask).
214 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
215 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
216 br label %vector.body
218 vector.body: ; preds = %vector.body, %vector.ph
219 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
220 %0 = getelementptr inbounds i32, i32* %b, i32 %index
; %1 masks both the load from %b and the store to %a.
221 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
222 %2 = bitcast i32* %0 to <4 x i32>*
223 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
224 %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
225 %4 = getelementptr inbounds i32, i32* %a, i32 %index
226 %5 = bitcast i32* %4 to <4 x i32>*
227 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
228 %index.next = add i32 %index, 4
229 %6 = icmp eq i32 %index.next, %n.vec
230 br i1 %6, label %for.cond.cleanup, label %vector.body
232 for.cond.cleanup: ; preds = %vector.body, %entry
; vector_add_const: a[i] = b[i] + c over i32 elements, four lanes at a time.
; %c is splatted into a <4 x i32>; each iteration does a masked load from %b,
; adds the splat and does a masked store to %a, with both memory operations
; using the lane mask built from (%index, %N).
; The expected assembly uses a dlstp.32/letp loop whose body has plain
; vldrw/vstrw and a vector-by-register vadd.i32 taking r2 directly.
; NOTE(review): as shown here the IR has no `entry:` label, no `ret`, and no
; closing brace; lines appear to be missing -- confirm against upstream.
236 define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
237 ; CHECK-LABEL: vector_add_const:
238 ; CHECK: @ %bb.0: @ %entry
239 ; CHECK-NEXT: push {r7, lr}
240 ; CHECK-NEXT: cmp r3, #0
242 ; CHECK-NEXT: popeq {r7, pc}
243 ; CHECK-NEXT: .LBB4_1: @ %vector.ph
244 ; CHECK-NEXT: dlstp.32 lr, r3
245 ; CHECK-NEXT: .LBB4_2: @ %vector.body
246 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
247 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
248 ; CHECK-NEXT: vadd.i32 q0, q0, r2
249 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
250 ; CHECK-NEXT: letp lr, .LBB4_2
251 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
252 ; CHECK-NEXT: pop {r7, pc}
; Skip the vector loop entirely when %N is zero.
254 %cmp6 = icmp eq i32 %N, 0
255 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
257 vector.ph: ; preds = %entry
; %n.vec = %N rounded up to a multiple of 4; it is the loop's exit bound.
258 %n.rnd.up = add i32 %N, 3
259 %n.vec = and i32 %n.rnd.up, -4
; Broadcast %c to all four lanes (insert into lane 0, shuffle with an
; all-zero index mask).
260 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
261 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
262 br label %vector.body
264 vector.body: ; preds = %vector.body, %vector.ph
265 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
266 %0 = getelementptr inbounds i32, i32* %b, i32 %index
; %1 masks both the load from %b and the store to %a.
267 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
268 %2 = bitcast i32* %0 to <4 x i32>*
269 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
270 %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
271 %4 = getelementptr inbounds i32, i32* %a, i32 %index
272 %5 = bitcast i32* %4 to <4 x i32>*
273 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
274 %index.next = add i32 %index, 4
275 %6 = icmp eq i32 %index.next, %n.vec
276 br i1 %6, label %for.cond.cleanup, label %vector.body
278 for.cond.cleanup: ; preds = %vector.body, %entry
; vector_mul_vector_i8: a[i] = b[i] * c[i] over i8 elements, sixteen lanes at
; a time. Each iteration does masked loads from %b and %c, multiplies them and
; does a masked store to %a; all three memory operations use the 16-lane mask
; built from (%index, %N).
; The expected assembly uses a dlstp.8/letp loop whose body has plain
; vldrb/vstrb and vmul.i8.
; NOTE(review): as shown here the IR has no `entry:` label, no `ret`, and no
; closing brace; lines appear to be missing -- confirm against upstream.
282 define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) {
283 ; CHECK-LABEL: vector_mul_vector_i8:
284 ; CHECK: @ %bb.0: @ %entry
285 ; CHECK-NEXT: push {r7, lr}
286 ; CHECK-NEXT: cmp r3, #0
288 ; CHECK-NEXT: popeq {r7, pc}
289 ; CHECK-NEXT: .LBB5_1: @ %vector.ph
290 ; CHECK-NEXT: dlstp.8 lr, r3
291 ; CHECK-NEXT: .LBB5_2: @ %vector.body
292 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
293 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16
294 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16
295 ; CHECK-NEXT: vmul.i8 q0, q1, q0
296 ; CHECK-NEXT: vstrb.8 q0, [r0], #16
297 ; CHECK-NEXT: letp lr, .LBB5_2
298 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
299 ; CHECK-NEXT: pop {r7, pc}
; Skip the vector loop entirely when %N is zero.
301 %cmp10 = icmp eq i32 %N, 0
302 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
304 vector.ph: ; preds = %entry
; %n.vec = %N rounded up to a multiple of 16; it is the loop's exit bound.
305 %n.rnd.up = add i32 %N, 15
306 %n.vec = and i32 %n.rnd.up, -16
307 br label %vector.body
309 vector.body: ; preds = %vector.body, %vector.ph
310 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
311 %0 = getelementptr inbounds i8, i8* %b, i32 %index
; %1 masks both loads and the store.
312 %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
313 %2 = bitcast i8* %0 to <16 x i8>*
314 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
315 %3 = getelementptr inbounds i8, i8* %c, i32 %index
316 %4 = bitcast i8* %3 to <16 x i8>*
317 %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %1, <16 x i8> undef)
318 %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load
319 %6 = getelementptr inbounds i8, i8* %a, i32 %index
320 %7 = bitcast i8* %6 to <16 x i8>*
321 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %5, <16 x i8>* %7, i32 1, <16 x i1> %1)
322 %index.next = add i32 %index, 16
323 %8 = icmp eq i32 %index.next, %n.vec
324 br i1 %8, label %for.cond.cleanup, label %vector.body
326 for.cond.cleanup: ; preds = %vector.body, %entry
; vector_mul_vector_i16: a[i] = b[i] * c[i] over i16 elements, eight lanes at
; a time. Each iteration does masked loads from %b and %c, multiplies them and
; does a masked store to %a; all three memory operations use the 8-lane mask
; built from (%index, %N).
; The expected assembly uses a dlstp.16/letp loop whose body has plain
; vldrh/vstrh and vmul.i16.
; NOTE(review): as shown here the IR has no `entry:` label, no `ret`, and no
; closing brace; lines appear to be missing -- confirm against upstream.
330 ; Function Attrs: nofree norecurse nounwind
331 define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
332 ; CHECK-LABEL: vector_mul_vector_i16:
333 ; CHECK: @ %bb.0: @ %entry
334 ; CHECK-NEXT: push {r7, lr}
335 ; CHECK-NEXT: cmp r3, #0
337 ; CHECK-NEXT: popeq {r7, pc}
338 ; CHECK-NEXT: .LBB6_1: @ %vector.ph
339 ; CHECK-NEXT: dlstp.16 lr, r3
340 ; CHECK-NEXT: .LBB6_2: @ %vector.body
341 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
342 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16
343 ; CHECK-NEXT: vldrh.u16 q1, [r2], #16
344 ; CHECK-NEXT: vmul.i16 q0, q1, q0
345 ; CHECK-NEXT: vstrh.16 q0, [r0], #16
346 ; CHECK-NEXT: letp lr, .LBB6_2
347 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
348 ; CHECK-NEXT: pop {r7, pc}
; Skip the vector loop entirely when %N is zero.
350 %cmp10 = icmp eq i32 %N, 0
351 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
353 vector.ph: ; preds = %entry
; %n.vec = %N rounded up to a multiple of 8; it is the loop's exit bound.
354 %n.rnd.up = add i32 %N, 7
355 %n.vec = and i32 %n.rnd.up, -8
356 br label %vector.body
358 vector.body: ; preds = %vector.body, %vector.ph
359 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
360 %0 = getelementptr inbounds i16, i16* %b, i32 %index
; %1 masks both loads and the store.
361 %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
362 %2 = bitcast i16* %0 to <8 x i16>*
363 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
364 %3 = getelementptr inbounds i16, i16* %c, i32 %index
365 %4 = bitcast i16* %3 to <8 x i16>*
366 %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %1, <8 x i16> undef)
367 %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load
368 %6 = getelementptr inbounds i16, i16* %a, i32 %index
369 %7 = bitcast i16* %6 to <8 x i16>*
370 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %5, <8 x i16>* %7, i32 2, <8 x i1> %1)
371 %index.next = add i32 %index, 8
372 %8 = icmp eq i32 %index.next, %n.vec
373 br i1 %8, label %for.cond.cleanup, label %vector.body
375 for.cond.cleanup: ; preds = %vector.body, %entry
; Intrinsic declarations used by the test functions in this file: masked loads
; and stores for <16 x i8>, <8 x i16> and <4 x i32>, the <4 x i32> add
; reduction, and the lane-mask generators for 4, 8 and 16 lanes.
379 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
380 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
381 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
382 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
383 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
384 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
385 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
386 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
387 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
388 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)