1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | FileCheck %s
; Byte add reduction where the select and the vector.reduce.add both sit inside
; %vector.body. Each iteration loads 16 masked bytes from %a and %b, adds them,
; and reduces the selected vector; the loop runs until %index.next reaches
; (N + 15) & -16. The expected output below is a dlstp.8/letp loop with
; unpredicated vldrb.u8 loads and a vaddv.u8 inside the loop.
; NOTE(review): in this view %vector.ph shows no terminator before %vector.body,
; and no entry label, ret or closing brace is visible - confirm against the full file.
4 define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
5 ; CHECK-LABEL: one_loop_add_add_v16i8:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: push {r7, lr}
8 ; CHECK-NEXT: cbz r2, .LBB0_4
9 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
10 ; CHECK-NEXT: vmov.i32 q0, #0x0
11 ; CHECK-NEXT: dlstp.8 lr, r2
12 ; CHECK-NEXT: .LBB0_2: @ %vector.body
13 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
14 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
15 ; CHECK-NEXT: vldrb.u8 q2, [r0], #16
16 ; CHECK-NEXT: vadd.i8 q0, q2, q1
17 ; CHECK-NEXT: vaddv.u8 r12, q0
18 ; CHECK-NEXT: letp lr, .LBB0_2
19 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
20 ; CHECK-NEXT: uxtb.w r0, r12
21 ; CHECK-NEXT: pop {r7, pc}
22 ; CHECK-NEXT: .LBB0_4:
23 ; CHECK-NEXT: mov.w r12, #0
24 ; CHECK-NEXT: uxtb.w r0, r12
25 ; CHECK-NEXT: pop {r7, pc}
; N == 0 skips the loop; the result phi in %for.cond.cleanup then yields 0.
27 %cmp11 = icmp eq i32 %N, 0
28 br i1 %cmp11, label %for.cond.cleanup, label %vector.ph
30 vector.ph: ; preds = %entry
; %n.vec = N rounded up to a multiple of 16 (the loop's exit bound).
31 %n.rnd.up = add i32 %N, 15
32 %n.vec = and i32 %n.rnd.up, -16
35 vector.body: ; preds = %vector.body, %vector.ph
36 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
37 %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
38 %i = getelementptr inbounds i8, ptr %a, i32 %index
39 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
40 %i1 = bitcast ptr %i to ptr
41 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
42 %i2 = getelementptr inbounds i8, ptr %b, i32 %index
43 %i3 = bitcast ptr %i2 to ptr
44 %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
45 %i4 = add <16 x i8> %wide.masked.load, %wide.masked.load16
; Inactive lanes take the previous %vec.phi value; the reduce is done per iteration.
46 %i5 = select <16 x i1> %active.lane.mask, <16 x i8> %i4, <16 x i8> %vec.phi
47 %i6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i5)
48 %index.next = add i32 %index, 16
49 %i7 = icmp eq i32 %index.next, %n.vec
50 br i1 %i7, label %middle.block, label %vector.body
52 middle.block: ; preds = %vector.body
53 br label %for.cond.cleanup
55 for.cond.cleanup: ; preds = %middle.block, %entry
56 %res.0.lcssa = phi i8 [ 0, %entry ], [ %i6, %middle.block ]
; Add reduction over bytes widened to i16: each iteration loads 8 masked bytes
; from %a and %b, zero-extends them to <8 x i16> and adds both into the
; accumulator %vec.phi. The select and vector.reduce.add are in %middle.block.
; The expected output below uses a dls/le loop with an explicit vctp.16 and
; predicated vldrbt.u16 loads, then vpsel + vaddv.u16 after the loop.
; NOTE(review): the entry checks show moveq/sxtheq with no IT line and no
; conditional return before .LBB1_1, unlike one_loop_sub_add_v8i16 - confirm
; the assertions are complete (regenerate with update_llc_test_checks.py).
60 define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
61 ; CHECK-LABEL: one_loop_add_add_v8i16:
62 ; CHECK: @ %bb.0: @ %entry
63 ; CHECK-NEXT: cmp r2, #0
65 ; CHECK-NEXT: moveq r0, #0
66 ; CHECK-NEXT: sxtheq r0, r0
68 ; CHECK-NEXT: .LBB1_1: @ %vector.ph
69 ; CHECK-NEXT: push {r7, lr}
70 ; CHECK-NEXT: adds r3, r2, #7
71 ; CHECK-NEXT: vmov.i32 q1, #0x0
72 ; CHECK-NEXT: bic r3, r3, #7
73 ; CHECK-NEXT: sub.w r12, r3, #8
74 ; CHECK-NEXT: movs r3, #1
75 ; CHECK-NEXT: add.w r3, r3, r12, lsr #3
76 ; CHECK-NEXT: dls lr, r3
77 ; CHECK-NEXT: .LBB1_2: @ %vector.body
78 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
79 ; CHECK-NEXT: vctp.16 r2
80 ; CHECK-NEXT: vmov q0, q1
82 ; CHECK-NEXT: vldrbt.u16 q1, [r0], #8
83 ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8
84 ; CHECK-NEXT: subs r2, #8
85 ; CHECK-NEXT: vadd.i16 q1, q0, q1
86 ; CHECK-NEXT: vadd.i16 q1, q1, q2
87 ; CHECK-NEXT: le lr, .LBB1_2
88 ; CHECK-NEXT: @ %bb.3: @ %middle.block
89 ; CHECK-NEXT: vpsel q0, q1, q0
90 ; CHECK-NEXT: vaddv.u16 r0, q0
91 ; CHECK-NEXT: pop.w {r7, lr}
92 ; CHECK-NEXT: sxth r0, r0
; N == 0 skips the loop; the result phi in %for.cond.cleanup then yields 0.
95 %cmp12 = icmp eq i32 %N, 0
96 br i1 %cmp12, label %for.cond.cleanup, label %vector.ph
98 vector.ph: ; preds = %entry
; %n.vec = N rounded up to a multiple of 8 (the loop's exit bound).
99 %n.rnd.up = add i32 %N, 7
100 %n.vec = and i32 %n.rnd.up, -8
101 br label %vector.body
103 vector.body: ; preds = %vector.body, %vector.ph
104 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
105 %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
106 %i = getelementptr inbounds i8, ptr %a, i32 %index
107 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
108 %i1 = bitcast ptr %i to ptr
109 %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
110 %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
111 %i3 = getelementptr inbounds i8, ptr %b, i32 %index
112 %i4 = bitcast ptr %i3 to ptr
113 %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
114 %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
; Accumulate both widened inputs: acc + a + b.
115 %i6 = add <8 x i16> %vec.phi, %i2
116 %i7 = add <8 x i16> %i6, %i5
117 %index.next = add i32 %index, 8
118 %i8 = icmp eq i32 %index.next, %n.vec
119 br i1 %i8, label %middle.block, label %vector.body
121 middle.block: ; preds = %vector.body
; Lanes inactive in the last iteration take the previous accumulator value.
122 %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
123 %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
124 br label %for.cond.cleanup
126 for.cond.cleanup: ; preds = %middle.block, %entry
127 %res.0.lcssa = phi i16 [ 0, %entry ], [ %i10, %middle.block ]
; Sub-then-add byte reduction: each iteration loads 16 masked bytes from %a and
; %b, computes a - b and adds the difference into the accumulator %vec.phi.
; The select and vector.reduce.add are in %middle.block.
; The expected output below uses a dls/le loop with an explicit vctp.8 and
; predicated vldrbt.u8 loads, then vpsel + vaddv.u8 after the loop.
131 define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
132 ; CHECK-LABEL: one_loop_sub_add_v16i8:
133 ; CHECK: @ %bb.0: @ %entry
134 ; CHECK-NEXT: cmp r2, #0
135 ; CHECK-NEXT: ittt eq
136 ; CHECK-NEXT: moveq r0, #0
137 ; CHECK-NEXT: uxtbeq r0, r0
138 ; CHECK-NEXT: bxeq lr
139 ; CHECK-NEXT: .LBB2_1: @ %vector.ph
140 ; CHECK-NEXT: push {r7, lr}
141 ; CHECK-NEXT: add.w r3, r2, #15
142 ; CHECK-NEXT: vmov.i32 q1, #0x0
143 ; CHECK-NEXT: bic r3, r3, #15
144 ; CHECK-NEXT: sub.w r12, r3, #16
145 ; CHECK-NEXT: movs r3, #1
146 ; CHECK-NEXT: add.w r3, r3, r12, lsr #4
147 ; CHECK-NEXT: dls lr, r3
148 ; CHECK-NEXT: .LBB2_2: @ %vector.body
149 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
150 ; CHECK-NEXT: vctp.8 r2
151 ; CHECK-NEXT: vmov q0, q1
153 ; CHECK-NEXT: vldrbt.u8 q1, [r1], #16
154 ; CHECK-NEXT: vldrbt.u8 q2, [r0], #16
155 ; CHECK-NEXT: subs r2, #16
156 ; CHECK-NEXT: vsub.i8 q1, q2, q1
157 ; CHECK-NEXT: vadd.i8 q1, q1, q0
158 ; CHECK-NEXT: le lr, .LBB2_2
159 ; CHECK-NEXT: @ %bb.3: @ %middle.block
160 ; CHECK-NEXT: vpsel q0, q1, q0
161 ; CHECK-NEXT: vaddv.u8 r0, q0
162 ; CHECK-NEXT: pop.w {r7, lr}
163 ; CHECK-NEXT: uxtb r0, r0
; N == 0 skips the loop; the result phi in %for.cond.cleanup then yields 0.
166 %cmp11 = icmp eq i32 %N, 0
167 br i1 %cmp11, label %for.cond.cleanup, label %vector.ph
169 vector.ph: ; preds = %entry
; %n.vec = N rounded up to a multiple of 16 (the loop's exit bound).
170 %n.rnd.up = add i32 %N, 15
171 %n.vec = and i32 %n.rnd.up, -16
172 br label %vector.body
174 vector.body: ; preds = %vector.body, %vector.ph
175 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
176 %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
177 %i = getelementptr inbounds i8, ptr %a, i32 %index
178 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
179 %i1 = bitcast ptr %i to ptr
180 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
181 %i2 = getelementptr inbounds i8, ptr %b, i32 %index
182 %i3 = bitcast ptr %i2 to ptr
183 %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
; acc += a - b
184 %i4 = sub <16 x i8> %wide.masked.load, %wide.masked.load16
185 %i5 = add <16 x i8> %i4, %vec.phi
186 %index.next = add i32 %index, 16
187 %i6 = icmp eq i32 %index.next, %n.vec
188 br i1 %i6, label %middle.block, label %vector.body
190 middle.block: ; preds = %vector.body
; Lanes inactive in the last iteration take the previous accumulator value.
191 %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi
192 %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7)
193 br label %for.cond.cleanup
195 for.cond.cleanup: ; preds = %middle.block, %entry
196 %res.0.lcssa = phi i8 [ 0, %entry ], [ %i8, %middle.block ]
; Sub-then-add reduction over bytes widened to i16: each iteration loads 8
; masked bytes from %a and %b, zero-extends them to <8 x i16>, computes b - a
; and adds the difference into the accumulator %vec.phi. The select and
; vector.reduce.add are in %middle.block.
; The expected output below uses a dls/le loop with an explicit vctp.16 and
; predicated vldrbt.u16 loads, then vpsel + vaddv.u16 after the loop.
200 define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
201 ; CHECK-LABEL: one_loop_sub_add_v8i16:
202 ; CHECK: @ %bb.0: @ %entry
203 ; CHECK-NEXT: cmp r2, #0
204 ; CHECK-NEXT: ittt eq
205 ; CHECK-NEXT: moveq r0, #0
206 ; CHECK-NEXT: sxtheq r0, r0
207 ; CHECK-NEXT: bxeq lr
208 ; CHECK-NEXT: .LBB3_1: @ %vector.ph
209 ; CHECK-NEXT: push {r7, lr}
210 ; CHECK-NEXT: adds r3, r2, #7
211 ; CHECK-NEXT: vmov.i32 q1, #0x0
212 ; CHECK-NEXT: bic r3, r3, #7
213 ; CHECK-NEXT: sub.w r12, r3, #8
214 ; CHECK-NEXT: movs r3, #1
215 ; CHECK-NEXT: add.w r3, r3, r12, lsr #3
216 ; CHECK-NEXT: dls lr, r3
217 ; CHECK-NEXT: .LBB3_2: @ %vector.body
218 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
219 ; CHECK-NEXT: vctp.16 r2
220 ; CHECK-NEXT: vmov q0, q1
222 ; CHECK-NEXT: vldrbt.u16 q1, [r0], #8
223 ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8
224 ; CHECK-NEXT: subs r2, #8
225 ; CHECK-NEXT: vsub.i16 q1, q2, q1
226 ; CHECK-NEXT: vadd.i16 q1, q1, q0
227 ; CHECK-NEXT: le lr, .LBB3_2
228 ; CHECK-NEXT: @ %bb.3: @ %middle.block
229 ; CHECK-NEXT: vpsel q0, q1, q0
230 ; CHECK-NEXT: vaddv.u16 r0, q0
231 ; CHECK-NEXT: pop.w {r7, lr}
232 ; CHECK-NEXT: sxth r0, r0
; N == 0 skips the loop; the result phi in %for.cond.cleanup then yields 0.
235 %cmp12 = icmp eq i32 %N, 0
236 br i1 %cmp12, label %for.cond.cleanup, label %vector.ph
238 vector.ph: ; preds = %entry
; %n.vec = N rounded up to a multiple of 8 (the loop's exit bound).
239 %n.rnd.up = add i32 %N, 7
240 %n.vec = and i32 %n.rnd.up, -8
241 br label %vector.body
243 vector.body: ; preds = %vector.body, %vector.ph
244 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
245 %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
246 %i = getelementptr inbounds i8, ptr %a, i32 %index
247 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
248 %i1 = bitcast ptr %i to ptr
249 %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
250 %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
251 %i3 = getelementptr inbounds i8, ptr %b, i32 %index
252 %i4 = bitcast ptr %i3 to ptr
253 %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
254 %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
; acc += b - a (operands are the widened loads from %b and %a respectively)
255 %i6 = sub <8 x i16> %i5, %i2
256 %i7 = add <8 x i16> %i6, %vec.phi
257 %index.next = add i32 %index, 8
258 %i8 = icmp eq i32 %index.next, %n.vec
259 br i1 %i8, label %middle.block, label %vector.body
261 middle.block: ; preds = %vector.body
; Lanes inactive in the last iteration take the previous accumulator value.
262 %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
263 %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
264 br label %for.cond.cleanup
266 for.cond.cleanup: ; preds = %middle.block, %entry
267 %res.0.lcssa = phi i16 [ 0, %entry ], [ %i10, %middle.block ]
; Multiply-accumulate byte reduction: each iteration loads 16 masked bytes from
; %a and %b, multiplies them and adds the product into the accumulator
; %vec.phi. The select and vector.reduce.add are in %middle.block.
; The expected output below uses a dls/le loop with an explicit vctp.8 and
; predicated vldrbt.u8 loads, then vpsel + vaddv.u8 after the loop.
271 define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
272 ; CHECK-LABEL: one_loop_mul_add_v16i8:
273 ; CHECK: @ %bb.0: @ %entry
274 ; CHECK-NEXT: cmp r2, #0
275 ; CHECK-NEXT: ittt eq
276 ; CHECK-NEXT: moveq r0, #0
277 ; CHECK-NEXT: uxtbeq r0, r0
278 ; CHECK-NEXT: bxeq lr
279 ; CHECK-NEXT: .LBB4_1: @ %vector.ph
280 ; CHECK-NEXT: push {r7, lr}
281 ; CHECK-NEXT: add.w r3, r2, #15
282 ; CHECK-NEXT: vmov.i32 q1, #0x0
283 ; CHECK-NEXT: bic r3, r3, #15
284 ; CHECK-NEXT: sub.w r12, r3, #16
285 ; CHECK-NEXT: movs r3, #1
286 ; CHECK-NEXT: add.w r3, r3, r12, lsr #4
287 ; CHECK-NEXT: dls lr, r3
288 ; CHECK-NEXT: .LBB4_2: @ %vector.body
289 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
290 ; CHECK-NEXT: vctp.8 r2
291 ; CHECK-NEXT: vmov q0, q1
293 ; CHECK-NEXT: vldrbt.u8 q1, [r0], #16
294 ; CHECK-NEXT: vldrbt.u8 q2, [r1], #16
295 ; CHECK-NEXT: subs r2, #16
296 ; CHECK-NEXT: vmul.i8 q1, q2, q1
297 ; CHECK-NEXT: vadd.i8 q1, q1, q0
298 ; CHECK-NEXT: le lr, .LBB4_2
299 ; CHECK-NEXT: @ %bb.3: @ %middle.block
300 ; CHECK-NEXT: vpsel q0, q1, q0
301 ; CHECK-NEXT: vaddv.u8 r0, q0
302 ; CHECK-NEXT: pop.w {r7, lr}
303 ; CHECK-NEXT: uxtb r0, r0
; N == 0 skips the loop; the result phi in %for.cond.cleanup then yields 0.
306 %cmp10 = icmp eq i32 %N, 0
307 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
309 vector.ph: ; preds = %entry
; %n.vec = N rounded up to a multiple of 16 (the loop's exit bound).
310 %n.rnd.up = add i32 %N, 15
311 %n.vec = and i32 %n.rnd.up, -16
312 br label %vector.body
314 vector.body: ; preds = %vector.body, %vector.ph
315 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
316 %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
317 %i = getelementptr inbounds i8, ptr %a, i32 %index
318 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
319 %i1 = bitcast ptr %i to ptr
320 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
321 %i2 = getelementptr inbounds i8, ptr %b, i32 %index
322 %i3 = bitcast ptr %i2 to ptr
323 %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
; acc += b * a
324 %i4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
325 %i5 = add <16 x i8> %i4, %vec.phi
326 %index.next = add i32 %index, 16
327 %i6 = icmp eq i32 %index.next, %n.vec
328 br i1 %i6, label %middle.block, label %vector.body
330 middle.block: ; preds = %vector.body
; Lanes inactive in the last iteration take the previous accumulator value.
331 %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi
332 %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7)
333 br label %for.cond.cleanup
335 for.cond.cleanup: ; preds = %middle.block, %entry
336 %res.0.lcssa = phi i8 [ 0, %entry ], [ %i8, %middle.block ]
; Multiply-accumulate reduction over bytes widened to i16: each iteration loads
; 8 masked bytes from %a and %b, zero-extends them to <8 x i16>, multiplies
; them and adds the product into the accumulator %vec.phi. The select and
; vector.reduce.add are in %middle.block.
; The expected output below uses a dls/le loop with an explicit vctp.16 and
; predicated vldrbt.u16 loads, then vpsel + vaddv.u16 after the loop.
340 define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
341 ; CHECK-LABEL: one_loop_mul_add_v8i16:
342 ; CHECK: @ %bb.0: @ %entry
343 ; CHECK-NEXT: cmp r2, #0
344 ; CHECK-NEXT: ittt eq
345 ; CHECK-NEXT: moveq r0, #0
346 ; CHECK-NEXT: sxtheq r0, r0
347 ; CHECK-NEXT: bxeq lr
348 ; CHECK-NEXT: .LBB5_1: @ %vector.ph
349 ; CHECK-NEXT: push {r7, lr}
350 ; CHECK-NEXT: adds r3, r2, #7
351 ; CHECK-NEXT: vmov.i32 q1, #0x0
352 ; CHECK-NEXT: bic r3, r3, #7
353 ; CHECK-NEXT: sub.w r12, r3, #8
354 ; CHECK-NEXT: movs r3, #1
355 ; CHECK-NEXT: add.w r3, r3, r12, lsr #3
356 ; CHECK-NEXT: dls lr, r3
357 ; CHECK-NEXT: .LBB5_2: @ %vector.body
358 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
359 ; CHECK-NEXT: vctp.16 r2
360 ; CHECK-NEXT: vmov q0, q1
362 ; CHECK-NEXT: vldrbt.u16 q1, [r0], #8
363 ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8
364 ; CHECK-NEXT: subs r2, #8
365 ; CHECK-NEXT: vmul.i16 q1, q2, q1
366 ; CHECK-NEXT: vadd.i16 q1, q1, q0
367 ; CHECK-NEXT: le lr, .LBB5_2
368 ; CHECK-NEXT: @ %bb.3: @ %middle.block
369 ; CHECK-NEXT: vpsel q0, q1, q0
370 ; CHECK-NEXT: vaddv.u16 r0, q0
371 ; CHECK-NEXT: pop.w {r7, lr}
372 ; CHECK-NEXT: sxth r0, r0
; N == 0 skips the loop; the result phi in %for.cond.cleanup then yields 0.
375 %cmp12 = icmp eq i32 %N, 0
376 br i1 %cmp12, label %for.cond.cleanup, label %vector.ph
378 vector.ph: ; preds = %entry
; %n.vec = N rounded up to a multiple of 8 (the loop's exit bound).
379 %n.rnd.up = add i32 %N, 7
380 %n.vec = and i32 %n.rnd.up, -8
381 br label %vector.body
383 vector.body: ; preds = %vector.body, %vector.ph
384 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
385 %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
386 %i = getelementptr inbounds i8, ptr %a, i32 %index
387 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
388 %i1 = bitcast ptr %i to ptr
389 %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
390 %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
391 %i3 = getelementptr inbounds i8, ptr %b, i32 %index
392 %i4 = bitcast ptr %i3 to ptr
393 %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
394 %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
; acc += b * a (on the widened values)
395 %i6 = mul <8 x i16> %i5, %i2
396 %i7 = add <8 x i16> %i6, %vec.phi
397 %index.next = add i32 %index, 8
398 %i8 = icmp eq i32 %index.next, %n.vec
399 br i1 %i8, label %middle.block, label %vector.body
401 middle.block: ; preds = %vector.body
; Lanes inactive in the last iteration take the previous accumulator value.
402 %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
403 %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
404 br label %for.cond.cleanup
406 for.cond.cleanup: ; preds = %middle.block, %entry
407 %res.0.lcssa = phi i16 [ 0, %entry ], [ %i10, %middle.block ]
; Two back-to-back multiply-accumulate reduction loops over the same inputs.
; Each loop loads 4 masked bytes from %a and %b per iteration, zero-extends
; them to <4 x i32>, multiplies and accumulates. The first loop's scalar result
; %i10 is inserted into lane 0 of the second loop's initial accumulator, so the
; second reduction continues from the first.
; The expected output below has two dls/le loops, each with an explicit
; vctp.32 and predicated vldrbt.u32 loads, followed by vpsel + vaddv.u32.
; NOTE(review): the entry checks show moveq/bxeq with no IT line between the
; cmp and the moveq - confirm the assertions are complete (regenerate with
; update_llc_test_checks.py).
411 define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
412 ; CHECK-LABEL: two_loops_mul_add_v4i32:
413 ; CHECK: @ %bb.0: @ %entry
414 ; CHECK-NEXT: cmp r2, #0
416 ; CHECK-NEXT: moveq r0, #0
417 ; CHECK-NEXT: bxeq lr
418 ; CHECK-NEXT: .LBB6_1: @ %vector.ph
419 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
420 ; CHECK-NEXT: adds r3, r2, #3
421 ; CHECK-NEXT: vmov.i32 q1, #0x0
422 ; CHECK-NEXT: bic r3, r3, #3
423 ; CHECK-NEXT: mov r4, r0
424 ; CHECK-NEXT: subs r7, r3, #4
425 ; CHECK-NEXT: movs r3, #1
426 ; CHECK-NEXT: mov r5, r1
427 ; CHECK-NEXT: add.w r6, r3, r7, lsr #2
428 ; CHECK-NEXT: mov r3, r2
429 ; CHECK-NEXT: dls lr, r6
430 ; CHECK-NEXT: .LBB6_2: @ %vector.body
431 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
432 ; CHECK-NEXT: vctp.32 r3
433 ; CHECK-NEXT: vmov q0, q1
435 ; CHECK-NEXT: vldrbt.u32 q1, [r4], #4
436 ; CHECK-NEXT: vldrbt.u32 q2, [r5], #4
437 ; CHECK-NEXT: subs r3, #4
438 ; CHECK-NEXT: vmul.i32 q1, q2, q1
439 ; CHECK-NEXT: vadd.i32 q1, q1, q0
440 ; CHECK-NEXT: le lr, .LBB6_2
441 ; CHECK-NEXT: @ %bb.3: @ %middle.block
442 ; CHECK-NEXT: vpsel q0, q1, q0
443 ; CHECK-NEXT: vaddv.u32 r12, q0
444 ; CHECK-NEXT: cbz r2, .LBB6_7
445 ; CHECK-NEXT: @ %bb.4: @ %vector.ph47
446 ; CHECK-NEXT: movs r3, #0
447 ; CHECK-NEXT: vdup.32 q0, r3
448 ; CHECK-NEXT: movs r3, #1
449 ; CHECK-NEXT: add.w r3, r3, r7, lsr #2
450 ; CHECK-NEXT: vmov.32 q0[0], r12
451 ; CHECK-NEXT: dls lr, r3
452 ; CHECK-NEXT: .LBB6_5: @ %vector.body46
453 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
454 ; CHECK-NEXT: vctp.32 r2
455 ; CHECK-NEXT: vmov q1, q0
457 ; CHECK-NEXT: vldrbt.u32 q0, [r0], #4
458 ; CHECK-NEXT: vldrbt.u32 q2, [r1], #4
459 ; CHECK-NEXT: subs r2, #4
460 ; CHECK-NEXT: vmul.i32 q0, q2, q0
461 ; CHECK-NEXT: vadd.i32 q0, q0, q1
462 ; CHECK-NEXT: le lr, .LBB6_5
463 ; CHECK-NEXT: @ %bb.6: @ %middle.block44
464 ; CHECK-NEXT: vpsel q0, q0, q1
465 ; CHECK-NEXT: vaddv.u32 r12, q0
466 ; CHECK-NEXT: .LBB6_7:
467 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, lr}
468 ; CHECK-NEXT: mov r0, r12
; N == 0 skips both loops; the result phi in %for.cond.cleanup7 then yields 0.
471 %cmp35 = icmp eq i32 %N, 0
472 br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph
474 vector.ph: ; preds = %entry
; %n.vec = N rounded up to a multiple of 4 (the first loop's exit bound).
475 %n.rnd.up = add i32 %N, 3
476 %n.vec = and i32 %n.rnd.up, -4
477 br label %vector.body
; First reduction loop.
479 vector.body: ; preds = %vector.body, %vector.ph
480 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
481 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
482 %i = getelementptr inbounds i8, ptr %a, i32 %index
483 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
484 %i1 = bitcast ptr %i to ptr
485 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %i1, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
486 %i2 = zext <4 x i8> %wide.masked.load to <4 x i32>
487 %i3 = getelementptr inbounds i8, ptr %b, i32 %index
488 %i4 = bitcast ptr %i3 to ptr
489 %wide.masked.load43 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %i4, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
490 %i5 = zext <4 x i8> %wide.masked.load43 to <4 x i32>
491 %i6 = mul nuw nsw <4 x i32> %i5, %i2
492 %i7 = add <4 x i32> %i6, %vec.phi
493 %index.next = add i32 %index, 4
494 %i8 = icmp eq i32 %index.next, %n.vec
495 br i1 %i8, label %middle.block, label %vector.body
497 middle.block: ; preds = %vector.body
498 %i9 = select <4 x i1> %active.lane.mask, <4 x i32> %i7, <4 x i32> %vec.phi
499 %i10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i9)
; Re-tests the same N == 0 condition as the entry block before the second loop.
500 br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph47
502 vector.ph47: ; preds = %middle.block
503 %n.rnd.up48 = add i32 %N, 3
504 %n.vec50 = and i32 %n.rnd.up48, -4
; Seed the second accumulator: lane 0 = first loop's result, lanes 1-3 = 0.
505 %i11 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %i10, i32 0
506 br label %vector.body46
; Second reduction loop, same body shape as the first.
508 vector.body46: ; preds = %vector.body46, %vector.ph47
509 %index51 = phi i32 [ 0, %vector.ph47 ], [ %index.next52, %vector.body46 ]
510 %vec.phi60 = phi <4 x i32> [ %i11, %vector.ph47 ], [ %i19, %vector.body46 ]
511 %i12 = getelementptr inbounds i8, ptr %a, i32 %index51
512 %active.lane.mask61 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index51, i32 %N)
513 %i13 = bitcast ptr %i12 to ptr
514 %wide.masked.load62 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %i13, i32 1, <4 x i1> %active.lane.mask61, <4 x i8> undef)
515 %i14 = zext <4 x i8> %wide.masked.load62 to <4 x i32>
516 %i15 = getelementptr inbounds i8, ptr %b, i32 %index51
517 %i16 = bitcast ptr %i15 to ptr
518 %wide.masked.load63 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %i16, i32 1, <4 x i1> %active.lane.mask61, <4 x i8> undef)
519 %i17 = zext <4 x i8> %wide.masked.load63 to <4 x i32>
520 %i18 = mul nuw nsw <4 x i32> %i17, %i14
521 %i19 = add <4 x i32> %i18, %vec.phi60
522 %index.next52 = add i32 %index51, 4
523 %i20 = icmp eq i32 %index.next52, %n.vec50
524 br i1 %i20, label %middle.block44, label %vector.body46
526 middle.block44: ; preds = %vector.body46
527 %i21 = select <4 x i1> %active.lane.mask61, <4 x i32> %i19, <4 x i32> %vec.phi60
528 %i22 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i21)
529 br label %for.cond.cleanup7
531 for.cond.cleanup7: ; preds = %middle.block44, %middle.block, %entry
532 %res.1.lcssa = phi i32 [ %i10, %middle.block ], [ 0, %entry ], [ %i22, %middle.block44 ]
; Two independent reductions carried by one loop. Each iteration loads 8 masked
; bytes from %a and %b, zero-extends them to <8 x i16>, and updates two
; accumulators: %vec.phi += b * a and %vec.phi.1 += b - a. Both are selected and
; reduced in %middle.block; the results are truncated to i8 and stored to %a
; (product sum) and %b (difference sum).
; The expected output below uses a dls/le loop with an explicit vctp.16 and
; predicated vldrbt.u16 loads, then two vpsel + vaddv.u16 pairs after the loop.
536 define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
537 ; CHECK-LABEL: two_reductions_mul_add_v8i16:
538 ; CHECK: @ %bb.0: @ %entry
539 ; CHECK-NEXT: push {r4, lr}
540 ; CHECK-NEXT: vpush {d8, d9}
541 ; CHECK-NEXT: cbz r2, .LBB7_4
542 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
543 ; CHECK-NEXT: adds r3, r2, #7
544 ; CHECK-NEXT: vmov.i32 q1, #0x0
545 ; CHECK-NEXT: bic r3, r3, #7
546 ; CHECK-NEXT: movs r4, #1
547 ; CHECK-NEXT: subs r3, #8
548 ; CHECK-NEXT: vmov q3, q1
549 ; CHECK-NEXT: add.w r12, r4, r3, lsr #3
550 ; CHECK-NEXT: mov r3, r0
551 ; CHECK-NEXT: mov r4, r1
552 ; CHECK-NEXT: dls lr, r12
553 ; CHECK-NEXT: .LBB7_2: @ %vector.body
554 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
555 ; CHECK-NEXT: vctp.16 r2
556 ; CHECK-NEXT: vmov q0, q1
558 ; CHECK-NEXT: vldrbt.u16 q1, [r3], #8
559 ; CHECK-NEXT: vldrbt.u16 q4, [r4], #8
560 ; CHECK-NEXT: vmov q2, q3
561 ; CHECK-NEXT: subs r2, #8
562 ; CHECK-NEXT: vsub.i16 q3, q4, q1
563 ; CHECK-NEXT: vmul.i16 q1, q4, q1
564 ; CHECK-NEXT: vadd.i16 q3, q3, q2
565 ; CHECK-NEXT: vadd.i16 q1, q1, q0
566 ; CHECK-NEXT: le lr, .LBB7_2
567 ; CHECK-NEXT: @ %bb.3: @ %middle.block
568 ; CHECK-NEXT: vpsel q2, q3, q2
569 ; CHECK-NEXT: vpsel q0, q1, q0
570 ; CHECK-NEXT: vaddv.u16 r4, q2
571 ; CHECK-NEXT: vaddv.u16 r2, q0
572 ; CHECK-NEXT: b .LBB7_5
573 ; CHECK-NEXT: .LBB7_4:
574 ; CHECK-NEXT: movs r2, #0
575 ; CHECK-NEXT: movs r4, #0
576 ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
577 ; CHECK-NEXT: strb r2, [r0]
578 ; CHECK-NEXT: strb r4, [r1]
579 ; CHECK-NEXT: vpop {d8, d9}
580 ; CHECK-NEXT: pop {r4, pc}
; N == 0 skips the loop; both result phis in %for.cond.cleanup then yield 0.
582 %cmp12 = icmp eq i32 %N, 0
583 br i1 %cmp12, label %for.cond.cleanup, label %vector.ph
585 vector.ph: ; preds = %entry
; %n.vec = N rounded up to a multiple of 8 (the loop's exit bound).
586 %n.rnd.up = add i32 %N, 7
587 %n.vec = and i32 %n.rnd.up, -8
588 br label %vector.body
590 vector.body: ; preds = %vector.body, %vector.ph
591 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
592 %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i8, %vector.body ]
593 %vec.phi.1 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i9, %vector.body ]
594 %i = getelementptr inbounds i8, ptr %a, i32 %index
595 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
596 %i1 = bitcast ptr %i to ptr
597 %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
598 %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
599 %i3 = getelementptr inbounds i8, ptr %b, i32 %index
600 %i4 = bitcast ptr %i3 to ptr
601 %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
602 %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
; Two accumulators share the same loads: product into %vec.phi, difference into %vec.phi.1.
603 %i6 = mul <8 x i16> %i5, %i2
604 %i7 = sub <8 x i16> %i5, %i2
605 %i8 = add <8 x i16> %i6, %vec.phi
606 %i9 = add <8 x i16> %i7, %vec.phi.1
607 %index.next = add i32 %index, 8
608 %i10 = icmp eq i32 %index.next, %n.vec
609 br i1 %i10, label %middle.block, label %vector.body
611 middle.block: ; preds = %vector.body
; Both reductions use the same final lane mask for their selects.
612 %i11 = select <8 x i1> %active.lane.mask, <8 x i16> %i8, <8 x i16> %vec.phi
613 %i12 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i11)
614 %i13 = select <8 x i1> %active.lane.mask, <8 x i16> %i9, <8 x i16> %vec.phi.1
615 %i14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i13)
616 br label %for.cond.cleanup
618 for.cond.cleanup: ; preds = %middle.block, %entry
619 %res.0.lcssa = phi i16 [ 0, %entry ], [ %i12, %middle.block ]
620 %res.1.lcssa = phi i16 [ 0, %entry ], [ %i14, %middle.block ]
; Only the low byte of each i16 result is written back.
621 %trunc.res.0 = trunc i16 %res.0.lcssa to i8
622 store i8 %trunc.res.0, ptr %a
623 %trunc.res.1 = trunc i16 %res.1.lcssa to i8
624 store i8 %trunc.res.1, ptr %b
; Data used by @wrongop: a four-i32 struct (fields 0, 1 and 2 are read there as
; day, month and year) and a 2 x 13 table of i32. The two rows of @days differ
; only at index 2 (28 vs 29) - presumably days-per-month for non-leap and leap
; years, with index 0 unused.
628 %struct.date = type { i32, i32, i32, i32 }
629 @days = internal unnamed_addr constant [2 x [13 x i32]] [[13 x i32] [i32 0, i32 31, i32 28, i32 31, i32 30, i32 31, i32 30, i32 31, i32 31, i32 30, i32 31, i32 30, i32 31], [13 x i32] [i32 0, i32 31, i32 29, i32 31, i32 30, i32 31, i32 30, i32 31, i32 31, i32 30, i32 31, i32 30, i32 31]], align 4
; Reduction over a row of @days seeded with a scalar. The function loads the
; day, year and month fields of %pd, derives a row index %3 (0 or 1) from the
; year via srem by 100 and 400, then sums 4-wide masked i32 loads from
; @days[%3] for lanes below the month value. The accumulator starts with the
; day in lane 0 and zeros elsewhere. If month <= 0 the loop is skipped and the
; day is the result.
; The expected output below uses a dls/le loop with an explicit vctp.32 and a
; predicated vldrwt.u32 load, then vpsel + vaddv.u32 after the loop.
; NOTE(review): in this view the checks show poplt with no IT line before it,
; and %lor.rhs / %middle.block show no terminator - confirm against the full file.
630 define i32 @wrongop(ptr nocapture readonly %pd) {
631 ; CHECK-LABEL: wrongop:
632 ; CHECK: @ %bb.0: @ %entry
633 ; CHECK-NEXT: push {r4, lr}
634 ; CHECK-NEXT: mov r1, r0
635 ; CHECK-NEXT: movw r12, #47184
636 ; CHECK-NEXT: movw r3, #23593
637 ; CHECK-NEXT: ldrd r2, lr, [r1, #4]
638 ; CHECK-NEXT: movt r12, #1310
639 ; CHECK-NEXT: movt r3, #49807
640 ; CHECK-NEXT: mla r3, lr, r3, r12
641 ; CHECK-NEXT: movw r1, #55051
642 ; CHECK-NEXT: movw r4, #23593
643 ; CHECK-NEXT: movt r1, #163
644 ; CHECK-NEXT: ldr r0, [r0]
645 ; CHECK-NEXT: movt r4, #655
646 ; CHECK-NEXT: ror.w r12, r3, #4
647 ; CHECK-NEXT: cmp r12, r1
648 ; CHECK-NEXT: cset r1, lo
649 ; CHECK-NEXT: ror.w r3, r3, #2
650 ; CHECK-NEXT: mov.w r12, #1
651 ; CHECK-NEXT: cmp r3, r4
652 ; CHECK-NEXT: csel r3, r1, r12, lo
653 ; CHECK-NEXT: lsls.w r4, lr, #30
654 ; CHECK-NEXT: csel r1, r1, r3, ne
655 ; CHECK-NEXT: cmp r2, #1
657 ; CHECK-NEXT: poplt {r4, pc}
658 ; CHECK-NEXT: .LBB8_1: @ %vector.ph
659 ; CHECK-NEXT: movw r3, :lower16:days
660 ; CHECK-NEXT: movs r4, #52
661 ; CHECK-NEXT: movt r3, :upper16:days
662 ; CHECK-NEXT: smlabb r1, r1, r4, r3
663 ; CHECK-NEXT: movs r3, #0
664 ; CHECK-NEXT: vdup.32 q0, r3
665 ; CHECK-NEXT: vmov.32 q0[0], r0
666 ; CHECK-NEXT: adds r0, r2, #3
667 ; CHECK-NEXT: bic r0, r0, #3
668 ; CHECK-NEXT: subs r0, #4
669 ; CHECK-NEXT: add.w r0, r12, r0, lsr #2
670 ; CHECK-NEXT: dls lr, r0
671 ; CHECK-NEXT: .LBB8_2: @ %vector.body
672 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
673 ; CHECK-NEXT: vctp.32 r2
674 ; CHECK-NEXT: vmov q1, q0
676 ; CHECK-NEXT: vldrwt.u32 q0, [r1], #16
677 ; CHECK-NEXT: subs r2, #4
678 ; CHECK-NEXT: vadd.i32 q0, q0, q1
679 ; CHECK-NEXT: le lr, .LBB8_2
680 ; CHECK-NEXT: @ %bb.3: @ %middle.block
681 ; CHECK-NEXT: vpsel q0, q0, q1
682 ; CHECK-NEXT: vaddv.u32 r0, q0
683 ; CHECK-NEXT: pop {r4, pc}
; %0 = day (field 0), %1 = year (field 2).
685 %day1 = getelementptr inbounds %struct.date, ptr %pd, i32 0, i32 0
686 %0 = load i32, ptr %day1, align 4
687 %year = getelementptr inbounds %struct.date, ptr %pd, i32 0, i32 2
688 %1 = load i32, ptr %year, align 4
; NOTE(review): %2 is used below but its definition is not visible in this view
; (the lsls #30 in the checks suggests a test of the low two bits of the year) - confirm.
690 %cmp = icmp ne i32 %2, 0
691 %rem3 = srem i32 %1, 100
692 %cmp4.not = icmp eq i32 %rem3, 0
693 %or.cond = or i1 %cmp, %cmp4.not
694 br i1 %or.cond, label %lor.rhs, label %lor.end
696 lor.rhs: ; preds = %entry
697 %rem6 = srem i32 %1, 400
698 %cmp7 = icmp eq i32 %rem6, 0
699 %phi.cast = zext i1 %cmp7 to i32
702 lor.end: ; preds = %entry, %lor.rhs
; %3 selects the row of @days: 1 when coming straight from entry, else (year % 400 == 0).
703 %3 = phi i32 [ %phi.cast, %lor.rhs ], [ 1, %entry ]
704 %month = getelementptr inbounds %struct.date, ptr %pd, i32 0, i32 1
705 %4 = load i32, ptr %month, align 4
706 %cmp820 = icmp sgt i32 %4, 0
707 br i1 %cmp820, label %vector.ph, label %for.end
709 vector.ph: ; preds = %lor.end
; %n.vec = month rounded up to a multiple of 4 (the loop's exit bound).
710 %n.rnd.up = add i32 %4, 3
711 %n.vec = and i32 %n.rnd.up, -4
; Seed the accumulator: lane 0 = day, lanes 1-3 = 0.
712 %5 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %0, i32 0
713 br label %vector.body
715 vector.body: ; preds = %vector.body, %vector.ph
716 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
717 %vec.phi = phi <4 x i32> [ %5, %vector.ph ], [ %8, %vector.body ]
718 %6 = getelementptr inbounds [2 x [13 x i32]], ptr @days, i32 0, i32 %3, i32 %index
719 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %4)
720 %7 = bitcast ptr %6 to ptr
721 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %7, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
722 %8 = add <4 x i32> %wide.masked.load, %vec.phi
723 %index.next = add i32 %index, 4
724 %9 = icmp eq i32 %index.next, %n.vec
725 br i1 %9, label %middle.block, label %vector.body
727 middle.block: ; preds = %vector.body
; Lanes inactive in the last iteration take the previous accumulator value.
728 %10 = select <4 x i1> %active.lane.mask, <4 x i32> %8, <4 x i32> %vec.phi
729 %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10)
732 for.end: ; preds = %middle.block, %lor.end
733 %day.0.lcssa = phi i32 [ %0, %lor.end ], [ %11, %middle.block ]
; Declarations of the intrinsics called by the test functions in this file:
; get.active.lane.mask (lane predicate from index and element count),
; masked.load (predicated vector load) and vector.reduce.add (horizontal add),
; each at the vector widths the tests use.
737 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
738 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
739 declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
740 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
741 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
742 declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32 immarg, <8 x i1>, <8 x i8>)
743 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
744 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
745 declare <4 x i8> @llvm.masked.load.v4i8.p0(ptr, i32 immarg, <4 x i1>, <4 x i8>)
746 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)