1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
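; A gather of 4 x i32 from data[offs[i] + 4] under an all-true mask: the constant increment is
; materialised as a vector add and the load is selected as a single vldrw.u32 gather with shifted
; offsets ([r0, q1, uxtw #2]).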
4 define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <4 x i32> %offs) {
5 ; CHECK-LABEL: gather_inc_mini_4i32:
7 ; CHECK-NEXT: movs r1, #4
8 ; CHECK-NEXT: vadd.i32 q1, q0, r1
9 ; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
11 %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
12 %2 = getelementptr inbounds i32, ptr %data, <4 x i32> %1
13 %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
14 ret <4 x i32> %wide.masked.gather
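; The same gather with an alternating mask: the predicate is moved into p0 with vmsr and the
; gather is issued as a predicated vldrwt.u32.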
17 define arm_aapcs_vfpcc <4 x i32> @gather_inc_minipred_4i32(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <4 x i32> %offs) {
18 ; CHECK-LABEL: gather_inc_minipred_4i32:
20 ; CHECK-NEXT: movs r1, #4
21 ; CHECK-NEXT: movw r2, #3855
22 ; CHECK-NEXT: vadd.i32 q1, q0, r1
23 ; CHECK-NEXT: vmsr p0, r2
25 ; CHECK-NEXT: vldrwt.u32 q0, [r0, q1, uxtw #2]
27 %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
28 %2 = getelementptr inbounds i32, ptr %data, <4 x i32> %1
29 %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef)
30 ret <4 x i32> %wide.masked.gather
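; The 8 x i16 and 16 x i8 variants below, which use 32-bit offset vectors, are not selected as
; hardware gathers; the checks show them scalarised into ldrh/ldrb loads and vmov lane inserts,
; with only the enabled lanes loaded in the minipred versions.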
33 define arm_aapcs_vfpcc <8 x i16> @gather_inc_mini_8i16(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <8 x i32> %offs) {
34 ; CHECK-LABEL: gather_inc_mini_8i16:
36 ; CHECK-NEXT: .save {r4, r5, r6, lr}
37 ; CHECK-NEXT: push {r4, r5, r6, lr}
38 ; CHECK-NEXT: vshl.i32 q1, q1, #1
39 ; CHECK-NEXT: mov.w r12, #16
40 ; CHECK-NEXT: vadd.i32 q1, q1, r0
41 ; CHECK-NEXT: vshl.i32 q0, q0, #1
42 ; CHECK-NEXT: vadd.i32 q1, q1, r12
43 ; CHECK-NEXT: vadd.i32 q0, q0, r0
44 ; CHECK-NEXT: vmov r1, lr, d3
45 ; CHECK-NEXT: vadd.i32 q0, q0, r12
46 ; CHECK-NEXT: vmov r0, r3, d1
47 ; CHECK-NEXT: vmov r2, r4, d2
48 ; CHECK-NEXT: ldrh r6, [r1]
49 ; CHECK-NEXT: vmov r1, r5, d0
50 ; CHECK-NEXT: ldrh r0, [r0]
51 ; CHECK-NEXT: ldrh r3, [r3]
52 ; CHECK-NEXT: ldrh r2, [r2]
53 ; CHECK-NEXT: ldrh r4, [r4]
54 ; CHECK-NEXT: ldrh.w r12, [lr]
55 ; CHECK-NEXT: ldrh r1, [r1]
56 ; CHECK-NEXT: ldrh r5, [r5]
57 ; CHECK-NEXT: vmov.16 q0[0], r1
58 ; CHECK-NEXT: vmov.16 q0[1], r5
59 ; CHECK-NEXT: vmov.16 q0[2], r0
60 ; CHECK-NEXT: vmov.16 q0[3], r3
61 ; CHECK-NEXT: vmov.16 q0[4], r2
62 ; CHECK-NEXT: vmov.16 q0[5], r4
63 ; CHECK-NEXT: vmov.16 q0[6], r6
64 ; CHECK-NEXT: vmov.16 q0[7], r12
65 ; CHECK-NEXT: pop {r4, r5, r6, pc}
66 %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
67 %2 = getelementptr inbounds i16, ptr %data, <8 x i32> %1
68 %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
69 ret <8 x i16> %wide.masked.gather
72 define arm_aapcs_vfpcc <8 x i16> @gather_inc_minipred_8i16(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <8 x i32> %offs) {
73 ; CHECK-LABEL: gather_inc_minipred_8i16:
75 ; CHECK-NEXT: vshl.i32 q0, q0, #1
76 ; CHECK-NEXT: movs r1, #16
77 ; CHECK-NEXT: vadd.i32 q0, q0, r0
78 ; CHECK-NEXT: vshl.i32 q1, q1, #1
79 ; CHECK-NEXT: vadd.i32 q0, q0, r1
80 ; CHECK-NEXT: vadd.i32 q1, q1, r0
81 ; CHECK-NEXT: vmov r2, s0
82 ; CHECK-NEXT: vadd.i32 q1, q1, r1
83 ; CHECK-NEXT: vmov r3, s2
84 ; CHECK-NEXT: vmov r0, s4
85 ; CHECK-NEXT: vmov r1, s6
86 ; CHECK-NEXT: ldrh r2, [r2]
87 ; CHECK-NEXT: ldrh r3, [r3]
88 ; CHECK-NEXT: vmov.16 q0[0], r2
89 ; CHECK-NEXT: ldrh r0, [r0]
90 ; CHECK-NEXT: vmov.16 q0[2], r3
91 ; CHECK-NEXT: ldrh r1, [r1]
92 ; CHECK-NEXT: vmov.16 q0[4], r0
93 ; CHECK-NEXT: vmov.16 q0[6], r1
95 %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
96 %2 = getelementptr inbounds i16, ptr %data, <8 x i32> %1
97 %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %2, i32 4, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> undef)
98 ret <8 x i16> %wide.masked.gather
101 define arm_aapcs_vfpcc <16 x i8> @gather_inc_mini_16i8(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <16 x i32> %offs) {
102 ; CHECK-LABEL: gather_inc_mini_16i8:
104 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
105 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
106 ; CHECK-NEXT: movs r5, #16
107 ; CHECK-NEXT: vadd.i32 q3, q3, r0
108 ; CHECK-NEXT: vadd.i32 q3, q3, r5
109 ; CHECK-NEXT: vadd.i32 q0, q0, r0
110 ; CHECK-NEXT: vmov r1, r2, d7
111 ; CHECK-NEXT: vadd.i32 q1, q1, r0
112 ; CHECK-NEXT: vmov r3, r4, d6
113 ; CHECK-NEXT: vadd.i32 q3, q0, r5
114 ; CHECK-NEXT: vadd.i32 q0, q2, r0
115 ; CHECK-NEXT: vadd.i32 q1, q1, r5
116 ; CHECK-NEXT: vadd.i32 q2, q0, r5
117 ; CHECK-NEXT: ldrb.w r12, [r1]
118 ; CHECK-NEXT: ldrb r1, [r3]
119 ; CHECK-NEXT: ldrb.w lr, [r2]
120 ; CHECK-NEXT: ldrb r3, [r4]
121 ; CHECK-NEXT: vmov r2, r4, d6
122 ; CHECK-NEXT: ldrb r2, [r2]
123 ; CHECK-NEXT: ldrb r4, [r4]
124 ; CHECK-NEXT: vmov.8 q0[0], r2
125 ; CHECK-NEXT: vmov r2, r6, d5
126 ; CHECK-NEXT: vmov.8 q0[1], r4
127 ; CHECK-NEXT: ldrb r4, [r2]
128 ; CHECK-NEXT: ldrb r2, [r6]
129 ; CHECK-NEXT: vmov r6, r7, d7
130 ; CHECK-NEXT: ldrb r0, [r6]
131 ; CHECK-NEXT: ldrb r7, [r7]
132 ; CHECK-NEXT: vmov.8 q0[2], r0
133 ; CHECK-NEXT: vmov r0, r5, d2
134 ; CHECK-NEXT: vmov.8 q0[3], r7
135 ; CHECK-NEXT: ldrb r0, [r0]
136 ; CHECK-NEXT: ldrb r5, [r5]
137 ; CHECK-NEXT: vmov.8 q0[4], r0
138 ; CHECK-NEXT: vmov.8 q0[5], r5
139 ; CHECK-NEXT: vmov r0, r5, d3
140 ; CHECK-NEXT: ldrb r0, [r0]
141 ; CHECK-NEXT: ldrb r5, [r5]
142 ; CHECK-NEXT: vmov.8 q0[6], r0
143 ; CHECK-NEXT: vmov.8 q0[7], r5
144 ; CHECK-NEXT: vmov r0, r5, d4
145 ; CHECK-NEXT: ldrb r0, [r0]
146 ; CHECK-NEXT: ldrb r5, [r5]
147 ; CHECK-NEXT: vmov.8 q0[8], r0
148 ; CHECK-NEXT: vmov.8 q0[9], r5
149 ; CHECK-NEXT: vmov.8 q0[10], r4
150 ; CHECK-NEXT: vmov.8 q0[11], r2
151 ; CHECK-NEXT: vmov.8 q0[12], r1
152 ; CHECK-NEXT: vmov.8 q0[13], r3
153 ; CHECK-NEXT: vmov.8 q0[14], r12
154 ; CHECK-NEXT: vmov.8 q0[15], lr
155 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
156 %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
157 %2 = getelementptr inbounds i8, ptr %data, <16 x i32> %1
158 %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
159 ret <16 x i8> %wide.masked.gather
162 define arm_aapcs_vfpcc <16 x i8> @gather_inc_minipred_16i8(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <16 x i32> %offs) {
163 ; CHECK-LABEL: gather_inc_minipred_16i8:
165 ; CHECK-NEXT: .save {r4, r5, r7, lr}
166 ; CHECK-NEXT: push {r4, r5, r7, lr}
167 ; CHECK-NEXT: movs r1, #16
168 ; CHECK-NEXT: vadd.i32 q1, q1, r0
169 ; CHECK-NEXT: vadd.i32 q1, q1, r1
170 ; CHECK-NEXT: vadd.i32 q2, q2, r0
171 ; CHECK-NEXT: vmov r2, s4
172 ; CHECK-NEXT: vadd.i32 q2, q2, r1
173 ; CHECK-NEXT: vadd.i32 q0, q0, r0
174 ; CHECK-NEXT: vmov r3, s10
175 ; CHECK-NEXT: vadd.i32 q0, q0, r1
176 ; CHECK-NEXT: vmov r4, s0
177 ; CHECK-NEXT: vmov r5, s2
178 ; CHECK-NEXT: ldrb.w r12, [r2]
179 ; CHECK-NEXT: vmov r2, s8
180 ; CHECK-NEXT: ldrb r3, [r3]
181 ; CHECK-NEXT: ldrb r4, [r4]
182 ; CHECK-NEXT: ldrb r5, [r5]
183 ; CHECK-NEXT: vmov.8 q0[0], r4
184 ; CHECK-NEXT: vmov.8 q0[2], r5
185 ; CHECK-NEXT: vmov.8 q0[4], r12
186 ; CHECK-NEXT: ldrb.w lr, [r2]
187 ; CHECK-NEXT: vmov r2, s6
188 ; CHECK-NEXT: vadd.i32 q1, q3, r0
189 ; CHECK-NEXT: vadd.i32 q1, q1, r1
190 ; CHECK-NEXT: vmov r0, s4
191 ; CHECK-NEXT: vmov r1, s6
192 ; CHECK-NEXT: ldrb r2, [r2]
193 ; CHECK-NEXT: vmov.8 q0[6], r2
194 ; CHECK-NEXT: vmov.8 q0[8], lr
195 ; CHECK-NEXT: ldrb r0, [r0]
196 ; CHECK-NEXT: vmov.8 q0[10], r3
197 ; CHECK-NEXT: ldrb r1, [r1]
198 ; CHECK-NEXT: vmov.8 q0[12], r0
199 ; CHECK-NEXT: vmov.8 q0[14], r1
200 ; CHECK-NEXT: pop {r4, r5, r7, pc}
201 %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
202 %2 = getelementptr inbounds i8, ptr %data, <16 x i32> %1
203 %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %2, i32 2, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
204 ret <16 x i8> %wide.masked.gather
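; In the two loops below the gather addresses advance by a constant amount per iteration, so the
; address vector is kept in a register and the gather uses the incrementing-with-writeback form
; vldrw.u32 [qN, #imm]!.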
207 define arm_aapcs_vfpcc void @gather_pre_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec) {
208 ; CHECK-LABEL: gather_pre_inc:
209 ; CHECK: @ %bb.0: @ %vector.ph
210 ; CHECK-NEXT: adr r3, .LCPI6_0
211 ; CHECK-NEXT: vldrw.u32 q0, [r3]
212 ; CHECK-NEXT: vadd.i32 q0, q0, r0
213 ; CHECK-NEXT: .LBB6_1: @ %vector.body
214 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
215 ; CHECK-NEXT: vldrw.u32 q1, [q0, #96]!
216 ; CHECK-NEXT: subs r2, #4
217 ; CHECK-NEXT: vstrb.8 q1, [r1], #16
218 ; CHECK-NEXT: bne .LBB6_1
219 ; CHECK-NEXT: @ %bb.2: @ %end
221 ; CHECK-NEXT: .p2align 4
222 ; CHECK-NEXT: @ %bb.3:
223 ; CHECK-NEXT: .LCPI6_0:
224 ; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
225 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
226 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
227 ; CHECK-NEXT: .long 0 @ 0x0
228 vector.ph: ; preds = %for.body.preheader
229 br label %vector.body
231 vector.body: ; preds = %vector.body, %vector.ph
232 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
233 %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
234 %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
235 %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
236 %2 = getelementptr inbounds i32, ptr %data, <4 x i32> %1
237 %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
238 %3 = getelementptr inbounds i32, ptr %dst, i32 %index
239 store <4 x i32> %wide.masked.gather, ptr %3, align 4
240 %index.next = add i32 %index, 4
241 %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
242 %4 = icmp eq i32 %index.next, %n.vec
243 br i1 %4, label %end, label %vector.body
249 define arm_aapcs_vfpcc void @gather_post_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec43) {
250 ; CHECK-LABEL: gather_post_inc:
251 ; CHECK: @ %bb.0: @ %vector.ph41
252 ; CHECK-NEXT: adr r3, .LCPI7_0
253 ; CHECK-NEXT: vldrw.u32 q0, [r3]
254 ; CHECK-NEXT: vadd.i32 q0, q0, r0
255 ; CHECK-NEXT: .LBB7_1: @ %vector.body39
256 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
257 ; CHECK-NEXT: vldrw.u32 q1, [q0, #96]!
258 ; CHECK-NEXT: subs r2, #4
259 ; CHECK-NEXT: vstrb.8 q1, [r1], #16
260 ; CHECK-NEXT: bne .LBB7_1
261 ; CHECK-NEXT: @ %bb.2: @ %end
263 ; CHECK-NEXT: .p2align 4
264 ; CHECK-NEXT: @ %bb.3:
265 ; CHECK-NEXT: .LCPI7_0:
266 ; CHECK-NEXT: .long 4294967200 @ 0xffffffa0
267 ; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
268 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
269 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
270 vector.ph41: ; preds = %for.body6.preheader
271 br label %vector.body39
273 vector.body39: ; preds = %vector.body39, %vector.ph41
274 %index44 = phi i32 [ 0, %vector.ph41 ], [ %index.next45, %vector.body39 ]
275 %vec.ind50 = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph41 ], [ %vec.ind.next51, %vector.body39 ]
276 %0 = mul nuw nsw <4 x i32> %vec.ind50, <i32 3, i32 3, i32 3, i32 3>
277 %1 = getelementptr inbounds i32, ptr %data, <4 x i32> %0
278 %wide.masked.gather55 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
279 %2 = getelementptr inbounds i32, ptr %dst, i32 %index44
280 store <4 x i32> %wide.masked.gather55, ptr %2, align 4
281 %index.next45 = add i32 %index44, 4
282 %vec.ind.next51 = add <4 x i32> %vec.ind50, <i32 8, i32 8, i32 8, i32 8>
283 %3 = icmp eq i32 %index.next45, %n.vec43
284 br i1 %3, label %end, label %vector.body39
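; Vectorised loops driven by a 4 x i32 induction: the initial offsets come from a constant pool
; and the inner loop keeps using the incrementing gather (vldrw.u32 [qN, #imm]!) inside a
; low-overhead loop (dls/le).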
290 define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
291 ; CHECK-LABEL: gather_inc_v4i32_simple:
292 ; CHECK: @ %bb.0: @ %entry
293 ; CHECK-NEXT: cmp r2, #1
295 ; CHECK-NEXT: bxlt lr
296 ; CHECK-NEXT: .LBB8_1: @ %vector.ph.preheader
297 ; CHECK-NEXT: .save {r4, lr}
298 ; CHECK-NEXT: push {r4, lr}
299 ; CHECK-NEXT: bic r12, r2, #3
300 ; CHECK-NEXT: movs r3, #1
301 ; CHECK-NEXT: sub.w lr, r12, #4
302 ; CHECK-NEXT: add.w r4, r3, lr, lsr #2
303 ; CHECK-NEXT: adr r3, .LCPI8_0
304 ; CHECK-NEXT: vldrw.u32 q0, [r3]
305 ; CHECK-NEXT: vadd.i32 q0, q0, r0
306 ; CHECK-NEXT: .LBB8_2: @ %vector.ph
307 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
308 ; CHECK-NEXT: @ Child Loop BB8_3 Depth 2
309 ; CHECK-NEXT: dls lr, r4
310 ; CHECK-NEXT: mov r0, r1
311 ; CHECK-NEXT: vmov q1, q0
312 ; CHECK-NEXT: .LBB8_3: @ %vector.body
313 ; CHECK-NEXT: @ Parent Loop BB8_2 Depth=1
314 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
315 ; CHECK-NEXT: vldrw.u32 q2, [q1, #16]!
316 ; CHECK-NEXT: vstrb.8 q2, [r0], #16
317 ; CHECK-NEXT: le lr, .LBB8_3
318 ; CHECK-NEXT: @ %bb.4: @ %middle.block
319 ; CHECK-NEXT: @ in Loop: Header=BB8_2 Depth=1
320 ; CHECK-NEXT: cmp r12, r2
321 ; CHECK-NEXT: bne .LBB8_2
322 ; CHECK-NEXT: @ %bb.5:
323 ; CHECK-NEXT: pop.w {r4, lr}
325 ; CHECK-NEXT: .p2align 4
326 ; CHECK-NEXT: @ %bb.6:
327 ; CHECK-NEXT: .LCPI8_0:
328 ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
329 ; CHECK-NEXT: .long 4294967284 @ 0xfffffff4
330 ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
331 ; CHECK-NEXT: .long 4294967292 @ 0xfffffffc
333 %cmp22 = icmp sgt i32 %n, 0
334 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
336 vector.ph: ; preds = %for.body.preheader
337 %n.vec = and i32 %n, -4
338 br label %vector.body
340 vector.body: ; preds = %vector.body, %vector.ph
341 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
342 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
343 %0 = getelementptr inbounds i32, ptr %data, <4 x i32> %vec.ind
344 %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %0, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
345 %1 = getelementptr inbounds i32, ptr %dst, i32 %index
346 store <4 x i32> %wide.masked.gather, ptr %1, align 4
347 %index.next = add i32 %index, 4
348 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
349 %2 = icmp eq i32 %index.next, %n.vec
350 br i1 %2, label %middle.block, label %vector.body
352 middle.block: ; preds = %vector.body
353 %cmp.n = icmp eq i32 %n.vec, %n
354 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
356 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
360 define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
361 ; CHECK-LABEL: gather_inc_v4i32_complex:
362 ; CHECK: @ %bb.0: @ %entry
363 ; CHECK-NEXT: cmp r2, #1
365 ; CHECK-NEXT: bxlt lr
366 ; CHECK-NEXT: .LBB9_1: @ %vector.ph.preheader
367 ; CHECK-NEXT: .save {r4, r5, r7, lr}
368 ; CHECK-NEXT: push {r4, r5, r7, lr}
369 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
370 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
371 ; CHECK-NEXT: bic r12, r2, #3
372 ; CHECK-NEXT: movs r3, #1
373 ; CHECK-NEXT: sub.w lr, r12, #4
374 ; CHECK-NEXT: adr r4, .LCPI9_1
375 ; CHECK-NEXT: adr r5, .LCPI9_2
376 ; CHECK-NEXT: vldrw.u32 q1, [r4]
377 ; CHECK-NEXT: add.w r3, r3, lr, lsr #2
378 ; CHECK-NEXT: adr.w lr, .LCPI9_0
379 ; CHECK-NEXT: vldrw.u32 q0, [r5]
380 ; CHECK-NEXT: vldrw.u32 q2, [lr]
381 ; CHECK-NEXT: vadd.i32 q1, q1, r0
382 ; CHECK-NEXT: vadd.i32 q0, q0, r0
383 ; CHECK-NEXT: vadd.i32 q2, q2, r0
384 ; CHECK-NEXT: .LBB9_2: @ %vector.ph
385 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
386 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 2
387 ; CHECK-NEXT: dls lr, r3
388 ; CHECK-NEXT: mov r0, r1
389 ; CHECK-NEXT: vmov q3, q1
390 ; CHECK-NEXT: vmov q4, q0
391 ; CHECK-NEXT: vmov q5, q2
392 ; CHECK-NEXT: .LBB9_3: @ %vector.body
393 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1
394 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
395 ; CHECK-NEXT: vldrw.u32 q6, [q5, #48]!
396 ; CHECK-NEXT: vldrw.u32 q7, [q3, #48]!
397 ; CHECK-NEXT: vadd.i32 q6, q7, q6
398 ; CHECK-NEXT: vldrw.u32 q7, [q4, #48]!
399 ; CHECK-NEXT: vadd.i32 q6, q6, q7
400 ; CHECK-NEXT: vstrb.8 q6, [r0], #16
401 ; CHECK-NEXT: le lr, .LBB9_3
402 ; CHECK-NEXT: @ %bb.4: @ %middle.block
403 ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1
404 ; CHECK-NEXT: cmp r12, r2
405 ; CHECK-NEXT: bne .LBB9_2
406 ; CHECK-NEXT: @ %bb.5:
407 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
408 ; CHECK-NEXT: pop.w {r4, r5, r7, lr}
410 ; CHECK-NEXT: .p2align 4
411 ; CHECK-NEXT: @ %bb.6:
412 ; CHECK-NEXT: .LCPI9_0:
413 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
414 ; CHECK-NEXT: .long 4294967260 @ 0xffffffdc
415 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
416 ; CHECK-NEXT: .long 4294967284 @ 0xfffffff4
417 ; CHECK-NEXT: .LCPI9_1:
418 ; CHECK-NEXT: .long 4294967252 @ 0xffffffd4
419 ; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
420 ; CHECK-NEXT: .long 4294967276 @ 0xffffffec
421 ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
422 ; CHECK-NEXT: .LCPI9_2:
423 ; CHECK-NEXT: .long 4294967256 @ 0xffffffd8
424 ; CHECK-NEXT: .long 4294967268 @ 0xffffffe4
425 ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
426 ; CHECK-NEXT: .long 4294967292 @ 0xfffffffc
428 %cmp22 = icmp sgt i32 %n, 0
429 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
431 vector.ph: ; preds = %for.body.preheader
432 %n.vec = and i32 %n, -4
433 br label %vector.body
435 vector.body: ; preds = %vector.body, %vector.ph
436 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
437 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
438 %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
439 %1 = getelementptr inbounds i32, ptr %data, <4 x i32> %0
440 %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
441 %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
442 %3 = getelementptr inbounds i32, ptr %data, <4 x i32> %2
443 %wide.masked.gather24 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
444 %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
445 %5 = getelementptr inbounds i32, ptr %data, <4 x i32> %4
446 %wide.masked.gather25 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
447 %6 = add nsw <4 x i32> %wide.masked.gather24, %wide.masked.gather
448 %7 = add nsw <4 x i32> %6, %wide.masked.gather25
449 %8 = getelementptr inbounds i32, ptr %dst, i32 %index
450 store <4 x i32> %7, ptr %8, align 4
451 %index.next = add i32 %index, 4
452 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
453 %9 = icmp eq i32 %index.next, %n.vec
454 br i1 %9, label %middle.block, label %vector.body
456 middle.block: ; preds = %vector.body
457 %cmp.n = icmp eq i32 %n.vec, %n
458 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
460 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
464 define arm_aapcs_vfpcc void @gather_inc_v4i32_large(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
465 ; CHECK-LABEL: gather_inc_v4i32_large:
466 ; CHECK: @ %bb.0: @ %entry
467 ; CHECK-NEXT: cmp r2, #1
469 ; CHECK-NEXT: bxlt lr
470 ; CHECK-NEXT: .LBB10_1: @ %vector.ph.preheader
471 ; CHECK-NEXT: .save {r4, lr}
472 ; CHECK-NEXT: push {r4, lr}
473 ; CHECK-NEXT: bic r12, r2, #3
474 ; CHECK-NEXT: movs r3, #1
475 ; CHECK-NEXT: sub.w lr, r12, #4
476 ; CHECK-NEXT: add.w r4, r3, lr, lsr #2
477 ; CHECK-NEXT: adr r3, .LCPI10_0
478 ; CHECK-NEXT: vldrw.u32 q0, [r3]
479 ; CHECK-NEXT: vadd.i32 q0, q0, r0
480 ; CHECK-NEXT: .LBB10_2: @ %vector.ph
481 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
482 ; CHECK-NEXT: @ Child Loop BB10_3 Depth 2
483 ; CHECK-NEXT: dls lr, r4
484 ; CHECK-NEXT: mov r0, r1
485 ; CHECK-NEXT: vmov q1, q0
486 ; CHECK-NEXT: .LBB10_3: @ %vector.body
487 ; CHECK-NEXT: @ Parent Loop BB10_2 Depth=1
488 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
489 ; CHECK-NEXT: vldrw.u32 q2, [q1, #508]!
490 ; CHECK-NEXT: vstrb.8 q2, [r0], #16
491 ; CHECK-NEXT: le lr, .LBB10_3
492 ; CHECK-NEXT: @ %bb.4: @ %middle.block
493 ; CHECK-NEXT: @ in Loop: Header=BB10_2 Depth=1
494 ; CHECK-NEXT: cmp r12, r2
495 ; CHECK-NEXT: bne .LBB10_2
496 ; CHECK-NEXT: @ %bb.5:
497 ; CHECK-NEXT: pop.w {r4, lr}
499 ; CHECK-NEXT: .p2align 4
500 ; CHECK-NEXT: @ %bb.6:
501 ; CHECK-NEXT: .LCPI10_0:
502 ; CHECK-NEXT: .long 4294966788 @ 0xfffffe04
503 ; CHECK-NEXT: .long 4294966792 @ 0xfffffe08
504 ; CHECK-NEXT: .long 4294966796 @ 0xfffffe0c
505 ; CHECK-NEXT: .long 4294966800 @ 0xfffffe10
507 %cmp22 = icmp sgt i32 %n, 0
508 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
510 vector.ph: ; preds = %for.body.preheader
511 %n.vec = and i32 %n, -4
512 br label %vector.body
514 vector.body: ; preds = %vector.body, %vector.ph
515 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
516 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
517 %0 = getelementptr inbounds i32, ptr %data, <4 x i32> %vec.ind
518 %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %0, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
519 %1 = getelementptr inbounds i32, ptr %dst, i32 %index
520 store <4 x i32> %wide.masked.gather, ptr %1, align 4
521 %index.next = add i32 %index, 4
522 %vec.ind.next = add <4 x i32> %vec.ind, <i32 127, i32 127, i32 127, i32 127>
523 %2 = icmp eq i32 %index.next, %n.vec
524 br i1 %2, label %middle.block, label %vector.body
526 middle.block: ; preds = %vector.body
527 %cmp.n = icmp eq i32 %n.vec, %n
528 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
530 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; TODO: uneven - I think it is not possible to create such an example, because vec.ind is always
; incremented by a whole vector of 4 elements at a time, so every offset stays a multiple of 4
; (=> x*4 = even); see the sketch in the comment below.
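; A minimal sketch (hypothetical, kept as a comment rather than a compiled test) of what an
; "uneven" variant would have to contain - a vector induction whose per-lane step is not a
; multiple of the 4-lane vector width:
;   %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
;   %gep = getelementptr inbounds i32, ptr %data, <4 x i32> %vec.ind
;   %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
;   %vec.ind.next = add <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
; The names %gep/%gather and the step of 3 are made up for illustration; the loops in this file
; always advance vec.ind by a full vector of lanes, so the step is a multiple of 4 and this shape
; does not arise.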
536 ; TODO: What is sxth?
537 define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
538 ; CHECK-LABEL: gather_inc_v8i16_simple:
539 ; CHECK: @ %bb.0: @ %entry
540 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
541 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
542 ; CHECK-NEXT: .pad #28
543 ; CHECK-NEXT: sub sp, #28
544 ; CHECK-NEXT: cmp r2, #1
545 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
546 ; CHECK-NEXT: mov r1, r2
547 ; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
548 ; CHECK-NEXT: blt .LBB11_5
549 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
550 ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
551 ; CHECK-NEXT: movs r6, #1
552 ; CHECK-NEXT: add r2, sp, #12
553 ; CHECK-NEXT: mov.w r9, #8
554 ; CHECK-NEXT: bic r1, r1, #7
555 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
556 ; CHECK-NEXT: sub.w r3, r1, #8
557 ; CHECK-NEXT: add.w r8, r6, r3, lsr #3
558 ; CHECK-NEXT: adr r3, .LCPI11_0
559 ; CHECK-NEXT: vldrw.u32 q0, [r3]
560 ; CHECK-NEXT: .LBB11_2: @ %vector.ph
561 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
562 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 2
563 ; CHECK-NEXT: dls lr, r8
564 ; CHECK-NEXT: vmov q1, q0
565 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
566 ; CHECK-NEXT: .LBB11_3: @ %vector.body
567 ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1
568 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
569 ; CHECK-NEXT: vstrw.32 q1, [r2]
570 ; CHECK-NEXT: mov r12, r2
571 ; CHECK-NEXT: vldrh.s32 q2, [r2, #8]
572 ; CHECK-NEXT: vadd.i16 q1, q1, r9
573 ; CHECK-NEXT: vshl.i32 q2, q2, #1
574 ; CHECK-NEXT: vadd.i32 q2, q2, r0
575 ; CHECK-NEXT: vmov r7, r5, d5
576 ; CHECK-NEXT: vmov r3, r4, d4
577 ; CHECK-NEXT: vldrh.s32 q2, [r2]
578 ; CHECK-NEXT: vshl.i32 q2, q2, #1
579 ; CHECK-NEXT: vadd.i32 q2, q2, r0
580 ; CHECK-NEXT: vmov r1, r10, d5
581 ; CHECK-NEXT: ldrh r7, [r7]
582 ; CHECK-NEXT: ldrh r4, [r4]
583 ; CHECK-NEXT: ldrh r5, [r5]
584 ; CHECK-NEXT: ldrh.w r2, [r10]
585 ; CHECK-NEXT: ldrh.w r10, [r3]
586 ; CHECK-NEXT: vmov r3, r11, d4
587 ; CHECK-NEXT: ldrh r1, [r1]
588 ; CHECK-NEXT: ldrh r3, [r3]
589 ; CHECK-NEXT: ldrh.w r11, [r11]
590 ; CHECK-NEXT: vmov.16 q2[0], r3
591 ; CHECK-NEXT: vmov.16 q2[1], r11
592 ; CHECK-NEXT: vmov.16 q2[2], r1
593 ; CHECK-NEXT: vmov.16 q2[3], r2
594 ; CHECK-NEXT: mov r2, r12
595 ; CHECK-NEXT: vmov.16 q2[4], r10
596 ; CHECK-NEXT: vmov.16 q2[5], r4
597 ; CHECK-NEXT: vmov.16 q2[6], r7
598 ; CHECK-NEXT: vmov.16 q2[7], r5
599 ; CHECK-NEXT: vstrb.8 q2, [r6], #16
600 ; CHECK-NEXT: le lr, .LBB11_3
601 ; CHECK-NEXT: @ %bb.4: @ %middle.block
602 ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1
603 ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
604 ; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
605 ; CHECK-NEXT: cmp r3, r1
606 ; CHECK-NEXT: bne .LBB11_2
607 ; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup
608 ; CHECK-NEXT: add sp, #28
609 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
610 ; CHECK-NEXT: .p2align 4
611 ; CHECK-NEXT: @ %bb.6:
612 ; CHECK-NEXT: .LCPI11_0:
613 ; CHECK-NEXT: .short 0 @ 0x0
614 ; CHECK-NEXT: .short 1 @ 0x1
615 ; CHECK-NEXT: .short 2 @ 0x2
616 ; CHECK-NEXT: .short 3 @ 0x3
617 ; CHECK-NEXT: .short 4 @ 0x4
618 ; CHECK-NEXT: .short 5 @ 0x5
619 ; CHECK-NEXT: .short 6 @ 0x6
620 ; CHECK-NEXT: .short 7 @ 0x7
624 %cmp22 = icmp sgt i32 %n, 0
625 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
627 vector.ph: ; preds = %for.body.preheader
628 %n.vec = and i32 %n, -8
629 br label %vector.body
631 vector.body: ; preds = %vector.body, %vector.ph
632 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
633 %vec.ind = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
634 %0 = getelementptr inbounds i16, ptr %data, <8 x i16> %vec.ind
635 %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %0, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
636 %1 = getelementptr inbounds i16, ptr %dst, i32 %index
637 store <8 x i16> %wide.masked.gather, ptr %1, align 2
638 %index.next = add i32 %index, 8
639 %vec.ind.next = add <8 x i16> %vec.ind, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
640 %2 = icmp eq i32 %index.next, %n.vec
641 br i1 %2, label %middle.block, label %vector.body
643 middle.block: ; preds = %vector.body
644 %cmp.n = icmp eq i32 %n.vec, %n
645 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
647 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
651 ; TODO: This looks absolutely terrifying :(
652 define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
653 ; CHECK-LABEL: gather_inc_v8i16_complex:
654 ; CHECK: @ %bb.0: @ %entry
655 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
656 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
657 ; CHECK-NEXT: .pad #4
658 ; CHECK-NEXT: sub sp, #4
659 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
660 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
661 ; CHECK-NEXT: .pad #136
662 ; CHECK-NEXT: sub sp, #136
663 ; CHECK-NEXT: cmp r2, #1
664 ; CHECK-NEXT: str r1, [sp, #64] @ 4-byte Spill
665 ; CHECK-NEXT: mov r1, r2
666 ; CHECK-NEXT: str r2, [sp, #68] @ 4-byte Spill
667 ; CHECK-NEXT: blt.w .LBB12_5
668 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
669 ; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload
670 ; CHECK-NEXT: adr r3, .LCPI12_2
671 ; CHECK-NEXT: vldrw.u32 q0, [r3]
672 ; CHECK-NEXT: movs r2, #1
673 ; CHECK-NEXT: bic r1, r1, #7
674 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
675 ; CHECK-NEXT: subs r1, #8
676 ; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill
677 ; CHECK-NEXT: vmov.i16 q2, #0x18
678 ; CHECK-NEXT: add.w r1, r2, r1, lsr #3
679 ; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill
680 ; CHECK-NEXT: adr r1, .LCPI12_0
681 ; CHECK-NEXT: adr r2, .LCPI12_1
682 ; CHECK-NEXT: vldrw.u32 q0, [r1]
683 ; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
684 ; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill
685 ; CHECK-NEXT: vldrw.u32 q0, [r2]
686 ; CHECK-NEXT: add r2, sp, #120
687 ; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill
688 ; CHECK-NEXT: .LBB12_2: @ %vector.ph
689 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
690 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2
691 ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
692 ; CHECK-NEXT: add.w r10, sp, #104
693 ; CHECK-NEXT: dls lr, r1
694 ; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload
695 ; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload
696 ; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload
697 ; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload
698 ; CHECK-NEXT: .LBB12_3: @ %vector.body
699 ; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1
700 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
701 ; CHECK-NEXT: vstrw.32 q5, [r2]
702 ; CHECK-NEXT: mov r8, r2
703 ; CHECK-NEXT: vldrh.s32 q0, [r2, #8]
704 ; CHECK-NEXT: vshl.i32 q0, q0, #1
705 ; CHECK-NEXT: vadd.i32 q0, q0, r0
706 ; CHECK-NEXT: vmov r1, r3, d0
707 ; CHECK-NEXT: vmov r4, r5, d1
708 ; CHECK-NEXT: vldrh.s32 q0, [r2]
709 ; CHECK-NEXT: vshl.i32 q0, q0, #1
710 ; CHECK-NEXT: vadd.i32 q2, q0, r0
711 ; CHECK-NEXT: vmov r6, r2, d4
712 ; CHECK-NEXT: ldrh r1, [r1]
713 ; CHECK-NEXT: ldrh.w r12, [r4]
714 ; CHECK-NEXT: add r4, sp, #88
715 ; CHECK-NEXT: ldrh.w r11, [r5]
716 ; CHECK-NEXT: ldrh r3, [r3]
717 ; CHECK-NEXT: ldrh r5, [r6]
718 ; CHECK-NEXT: ldrh r2, [r2]
719 ; CHECK-NEXT: vstrw.32 q6, [r4]
720 ; CHECK-NEXT: vldrh.s32 q0, [r4]
721 ; CHECK-NEXT: vmov.16 q7[0], r5
722 ; CHECK-NEXT: vmov.16 q7[1], r2
723 ; CHECK-NEXT: vshl.i32 q0, q0, #1
724 ; CHECK-NEXT: vadd.i32 q0, q0, r0
725 ; CHECK-NEXT: vmov r6, r9, d0
726 ; CHECK-NEXT: vmov r2, r5, d1
727 ; CHECK-NEXT: vldrh.s32 q0, [r4, #8]
728 ; CHECK-NEXT: vshl.i32 q0, q0, #1
729 ; CHECK-NEXT: vadd.i32 q0, q0, r0
730 ; CHECK-NEXT: ldrh r6, [r6]
731 ; CHECK-NEXT: ldrh r2, [r2]
732 ; CHECK-NEXT: vmov.16 q1[0], r6
733 ; CHECK-NEXT: ldrh.w r6, [r9]
734 ; CHECK-NEXT: ldrh r5, [r5]
735 ; CHECK-NEXT: vmov.16 q1[1], r6
736 ; CHECK-NEXT: vmov.16 q1[2], r2
737 ; CHECK-NEXT: vmov r2, r6, d0
738 ; CHECK-NEXT: vmov.16 q1[3], r5
739 ; CHECK-NEXT: ldrh r2, [r2]
740 ; CHECK-NEXT: ldrh r6, [r6]
741 ; CHECK-NEXT: vmov.16 q1[4], r2
742 ; CHECK-NEXT: vmov r2, r5, d1
743 ; CHECK-NEXT: vmov.16 q1[5], r6
744 ; CHECK-NEXT: mov r6, r10
745 ; CHECK-NEXT: ldrh r2, [r2]
746 ; CHECK-NEXT: ldrh r5, [r5]
747 ; CHECK-NEXT: vstrw.32 q4, [r10]
748 ; CHECK-NEXT: vldrh.s32 q0, [r6]
749 ; CHECK-NEXT: vmov.16 q1[6], r2
750 ; CHECK-NEXT: vmov.16 q1[7], r5
751 ; CHECK-NEXT: vshl.i32 q0, q0, #1
752 ; CHECK-NEXT: vadd.i32 q0, q0, r0
753 ; CHECK-NEXT: vmov r2, r5, d0
754 ; CHECK-NEXT: ldrh r2, [r2]
755 ; CHECK-NEXT: ldrh r5, [r5]
756 ; CHECK-NEXT: vmov.16 q3[0], r2
757 ; CHECK-NEXT: vmov.16 q3[1], r5
758 ; CHECK-NEXT: vmov r2, r5, d5
759 ; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
760 ; CHECK-NEXT: vadd.i16 q6, q6, q2
761 ; CHECK-NEXT: vadd.i16 q5, q5, q2
762 ; CHECK-NEXT: vadd.i16 q4, q4, q2
763 ; CHECK-NEXT: ldrh.w r9, [r2]
764 ; CHECK-NEXT: vmov r2, r4, d1
765 ; CHECK-NEXT: vldrh.s32 q0, [r6, #8]
766 ; CHECK-NEXT: ldrh r5, [r5]
767 ; CHECK-NEXT: vmov.16 q7[2], r9
768 ; CHECK-NEXT: vshl.i32 q0, q0, #1
769 ; CHECK-NEXT: vmov.16 q7[3], r5
770 ; CHECK-NEXT: vadd.i32 q0, q0, r0
771 ; CHECK-NEXT: vmov.16 q7[4], r1
772 ; CHECK-NEXT: vmov.16 q7[5], r3
773 ; CHECK-NEXT: vmov.16 q7[6], r12
774 ; CHECK-NEXT: vmov.16 q7[7], r11
775 ; CHECK-NEXT: ldrh r2, [r2]
776 ; CHECK-NEXT: ldrh r4, [r4]
777 ; CHECK-NEXT: vmov.16 q3[2], r2
778 ; CHECK-NEXT: vmov.16 q3[3], r4
779 ; CHECK-NEXT: vmov r2, r4, d0
780 ; CHECK-NEXT: ldrh r2, [r2]
781 ; CHECK-NEXT: ldrh r4, [r4]
782 ; CHECK-NEXT: vmov.16 q3[4], r2
783 ; CHECK-NEXT: vmov.16 q3[5], r4
784 ; CHECK-NEXT: vmov r2, r4, d1
785 ; CHECK-NEXT: ldrh r2, [r2]
786 ; CHECK-NEXT: ldrh r4, [r4]
787 ; CHECK-NEXT: vmov.16 q3[6], r2
788 ; CHECK-NEXT: mov r2, r8
789 ; CHECK-NEXT: vmov.16 q3[7], r4
790 ; CHECK-NEXT: vadd.i16 q0, q3, q1
791 ; CHECK-NEXT: vadd.i16 q0, q0, q7
792 ; CHECK-NEXT: vstrb.8 q0, [r7], #16
793 ; CHECK-NEXT: le lr, .LBB12_3
794 ; CHECK-NEXT: @ %bb.4: @ %middle.block
795 ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1
796 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
797 ; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload
798 ; CHECK-NEXT: cmp r1, r3
799 ; CHECK-NEXT: bne.w .LBB12_2
800 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup
801 ; CHECK-NEXT: add sp, #136
802 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
803 ; CHECK-NEXT: add sp, #4
804 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
805 ; CHECK-NEXT: .p2align 4
806 ; CHECK-NEXT: @ %bb.6:
807 ; CHECK-NEXT: .LCPI12_0:
808 ; CHECK-NEXT: .short 1 @ 0x1
809 ; CHECK-NEXT: .short 4 @ 0x4
810 ; CHECK-NEXT: .short 7 @ 0x7
811 ; CHECK-NEXT: .short 10 @ 0xa
812 ; CHECK-NEXT: .short 13 @ 0xd
813 ; CHECK-NEXT: .short 16 @ 0x10
814 ; CHECK-NEXT: .short 19 @ 0x13
815 ; CHECK-NEXT: .short 22 @ 0x16
816 ; CHECK-NEXT: .LCPI12_1:
817 ; CHECK-NEXT: .short 0 @ 0x0
818 ; CHECK-NEXT: .short 3 @ 0x3
819 ; CHECK-NEXT: .short 6 @ 0x6
820 ; CHECK-NEXT: .short 9 @ 0x9
821 ; CHECK-NEXT: .short 12 @ 0xc
822 ; CHECK-NEXT: .short 15 @ 0xf
823 ; CHECK-NEXT: .short 18 @ 0x12
824 ; CHECK-NEXT: .short 21 @ 0x15
825 ; CHECK-NEXT: .LCPI12_2:
826 ; CHECK-NEXT: .short 2 @ 0x2
827 ; CHECK-NEXT: .short 5 @ 0x5
828 ; CHECK-NEXT: .short 8 @ 0x8
829 ; CHECK-NEXT: .short 11 @ 0xb
830 ; CHECK-NEXT: .short 14 @ 0xe
831 ; CHECK-NEXT: .short 17 @ 0x11
832 ; CHECK-NEXT: .short 20 @ 0x14
833 ; CHECK-NEXT: .short 23 @ 0x17
837 %cmp22 = icmp sgt i32 %n, 0
838 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
840 vector.ph: ; preds = %for.body.preheader
841 %n.vec = and i32 %n, -8
842 br label %vector.body
844 vector.body: ; preds = %vector.body, %vector.ph
845 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
846 %vec.ind = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
847 %0 = mul nuw nsw <8 x i16> %vec.ind, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
848 %1 = getelementptr inbounds i16, ptr %data, <8 x i16> %0
849 %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
850 %2 = add nuw nsw <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
851 %3 = getelementptr inbounds i16, ptr %data, <8 x i16> %2
852 %wide.masked.gather24 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %3, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
853 %4 = add nuw nsw <8 x i16> %0, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
854 %5 = getelementptr inbounds i16, ptr %data, <8 x i16> %4
855 %wide.masked.gather25 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %5, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
856 %6 = add nsw <8 x i16> %wide.masked.gather24, %wide.masked.gather
857 %7 = add nsw <8 x i16> %6, %wide.masked.gather25
858 %8 = getelementptr inbounds i16, ptr %dst, i32 %index
859 store <8 x i16> %7, ptr %8, align 2
860 %index.next = add i32 %index, 8
861 %vec.ind.next = add <8 x i16> %vec.ind, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
862 %9 = icmp eq i32 %index.next, %n.vec
863 br i1 %9, label %middle.block, label %vector.body
865 middle.block: ; preds = %vector.body
866 %cmp.n = icmp eq i32 %n.vec, %n
867 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
869 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
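; The 16 x i8 versions below are likewise scalarised into ldrb loads and vmov.8 lane inserts,
; with the offset vectors kept live (or spilled) across the inner loops.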
874 define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
875 ; CHECK-LABEL: gather_inc_v16i8_complex:
876 ; CHECK: @ %bb.0: @ %entry
877 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
878 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
879 ; CHECK-NEXT: .pad #4
880 ; CHECK-NEXT: sub sp, #4
881 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
882 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
883 ; CHECK-NEXT: .pad #312
884 ; CHECK-NEXT: sub sp, #312
885 ; CHECK-NEXT: cmp r2, #1
886 ; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill
887 ; CHECK-NEXT: blt.w .LBB13_5
888 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
889 ; CHECK-NEXT: adr r1, .LCPI13_0
890 ; CHECK-NEXT: adr r6, .LCPI13_8
891 ; CHECK-NEXT: vldrw.u32 q0, [r1]
892 ; CHECK-NEXT: adr r1, .LCPI13_1
893 ; CHECK-NEXT: adr r7, .LCPI13_7
894 ; CHECK-NEXT: adr r3, .LCPI13_6
895 ; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
896 ; CHECK-NEXT: vldrw.u32 q0, [r1]
897 ; CHECK-NEXT: adr r1, .LCPI13_5
898 ; CHECK-NEXT: bic r10, r2, #7
899 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
900 ; CHECK-NEXT: vldrw.u32 q0, [r6]
901 ; CHECK-NEXT: adr r6, .LCPI13_9
902 ; CHECK-NEXT: vmov.i32 q2, #0x30
903 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
904 ; CHECK-NEXT: vldrw.u32 q0, [r7]
905 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
906 ; CHECK-NEXT: vldrw.u32 q0, [r6]
907 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
908 ; CHECK-NEXT: vldrw.u32 q0, [r1]
909 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
910 ; CHECK-NEXT: vldrw.u32 q0, [r3]
911 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
912 ; CHECK-NEXT: .LBB13_2: @ %vector.ph
913 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
914 ; CHECK-NEXT: @ Child Loop BB13_3 Depth 2
915 ; CHECK-NEXT: adr r1, .LCPI13_3
916 ; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
917 ; CHECK-NEXT: vldrw.u32 q0, [r1]
918 ; CHECK-NEXT: adr r1, .LCPI13_4
919 ; CHECK-NEXT: vldrw.u32 q5, [r1]
920 ; CHECK-NEXT: adr r1, .LCPI13_2
921 ; CHECK-NEXT: vldrw.u32 q3, [r1]
922 ; CHECK-NEXT: adr r1, .LCPI13_10
923 ; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill
924 ; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload
925 ; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
926 ; CHECK-NEXT: vldrw.u32 q3, [r1]
927 ; CHECK-NEXT: adr r1, .LCPI13_11
928 ; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload
929 ; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill
930 ; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
931 ; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill
932 ; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
933 ; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
934 ; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
935 ; CHECK-NEXT: vldrw.u32 q7, [r1]
936 ; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
937 ; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill
938 ; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload
939 ; CHECK-NEXT: mov r11, r10
940 ; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
941 ; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill
942 ; CHECK-NEXT: .LBB13_3: @ %vector.body
943 ; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1
944 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
945 ; CHECK-NEXT: vadd.i32 q4, q1, r0
946 ; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill
947 ; CHECK-NEXT: vmov r1, lr, d8
948 ; CHECK-NEXT: vadd.i32 q7, q7, r0
949 ; CHECK-NEXT: vmov r5, r4, d15
950 ; CHECK-NEXT: vadd.i32 q6, q0, r0
951 ; CHECK-NEXT: vmov r6, r7, d13
952 ; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill
953 ; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
954 ; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill
955 ; CHECK-NEXT: vmov q0, q2
956 ; CHECK-NEXT: vmov q3, q5
957 ; CHECK-NEXT: vadd.i32 q1, q1, r0
958 ; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload
959 ; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload
960 ; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill
961 ; CHECK-NEXT: vadd.i32 q0, q0, r0
962 ; CHECK-NEXT: subs.w r11, r11, #16
963 ; CHECK-NEXT: ldrb.w r9, [r1]
964 ; CHECK-NEXT: vmov r1, r3, d14
965 ; CHECK-NEXT: ldrb r5, [r5]
966 ; CHECK-NEXT: ldrb r7, [r7]
967 ; CHECK-NEXT: ldrb r1, [r1]
968 ; CHECK-NEXT: vmov.8 q7[0], r1
969 ; CHECK-NEXT: ldrb r1, [r3]
970 ; CHECK-NEXT: vmov.8 q7[1], r1
971 ; CHECK-NEXT: vmov r1, r3, d12
972 ; CHECK-NEXT: vmov.8 q7[2], r5
973 ; CHECK-NEXT: ldrb r5, [r6]
974 ; CHECK-NEXT: ldrb r6, [r4]
975 ; CHECK-NEXT: vmov.8 q7[3], r6
976 ; CHECK-NEXT: ldrb r1, [r1]
977 ; CHECK-NEXT: ldrb r3, [r3]
978 ; CHECK-NEXT: vmov.8 q6[0], r1
979 ; CHECK-NEXT: vmov r6, r1, d2
980 ; CHECK-NEXT: vmov.8 q6[1], r3
981 ; CHECK-NEXT: vmov.8 q6[2], r5
982 ; CHECK-NEXT: vmov.8 q6[3], r7
983 ; CHECK-NEXT: ldrb.w r7, [lr]
984 ; CHECK-NEXT: vmov.8 q6[4], r9
985 ; CHECK-NEXT: vmov.8 q6[5], r7
986 ; CHECK-NEXT: ldrb r4, [r1]
987 ; CHECK-NEXT: vmov r1, r5, d3
988 ; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload
989 ; CHECK-NEXT: ldrb.w r12, [r1]
990 ; CHECK-NEXT: vmov r1, r3, d9
991 ; CHECK-NEXT: ldrb r5, [r5]
992 ; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload
993 ; CHECK-NEXT: ldrb r1, [r1]
994 ; CHECK-NEXT: ldrb r3, [r3]
995 ; CHECK-NEXT: vmov.8 q6[6], r1
996 ; CHECK-NEXT: vmov r1, r7, d0
997 ; CHECK-NEXT: vmov.8 q6[7], r3
998 ; CHECK-NEXT: ldrb r1, [r1]
999 ; CHECK-NEXT: ldrb r7, [r7]
1000 ; CHECK-NEXT: vmov.8 q7[4], r1
1001 ; CHECK-NEXT: vmov r1, r3, d1
1002 ; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload
1003 ; CHECK-NEXT: vmov.8 q7[5], r7
1004 ; CHECK-NEXT: vadd.i32 q0, q0, r0
1005 ; CHECK-NEXT: ldrb r1, [r1]
1006 ; CHECK-NEXT: ldrb r3, [r3]
1007 ; CHECK-NEXT: vmov.8 q7[6], r1
1008 ; CHECK-NEXT: ldrb r1, [r6]
1009 ; CHECK-NEXT: vmov r7, r6, d0
1010 ; CHECK-NEXT: vmov.8 q7[7], r3
1011 ; CHECK-NEXT: vmov r3, lr, d1
1012 ; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
1013 ; CHECK-NEXT: vmov.8 q7[8], r1
1014 ; CHECK-NEXT: vadd.i32 q0, q0, r0
1015 ; CHECK-NEXT: vmov.8 q7[9], r4
1016 ; CHECK-NEXT: vmov r4, r1, d0
1017 ; CHECK-NEXT: vmov.8 q7[10], r12
1018 ; CHECK-NEXT: vmov.8 q7[11], r5
1019 ; CHECK-NEXT: ldrb r7, [r7]
1020 ; CHECK-NEXT: ldrb r6, [r6]
1021 ; CHECK-NEXT: ldrb r3, [r3]
1022 ; CHECK-NEXT: ldrb r4, [r4]
1023 ; CHECK-NEXT: ldrb r1, [r1]
1024 ; CHECK-NEXT: vmov.8 q6[8], r4
1025 ; CHECK-NEXT: vmov r5, r4, d1
1026 ; CHECK-NEXT: vmov.8 q6[9], r1
1027 ; CHECK-NEXT: vadd.i32 q0, q5, r0
1028 ; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
1029 ; CHECK-NEXT: ldrb r5, [r5]
1030 ; CHECK-NEXT: ldrb r4, [r4]
1031 ; CHECK-NEXT: vmov.8 q6[10], r5
1032 ; CHECK-NEXT: vmov.8 q6[11], r4
1033 ; CHECK-NEXT: vmov.8 q6[12], r7
1034 ; CHECK-NEXT: vmov.8 q6[13], r6
1035 ; CHECK-NEXT: vmov.8 q6[14], r3
1036 ; CHECK-NEXT: vmov r1, r3, d0
1037 ; CHECK-NEXT: ldrb r1, [r1]
1038 ; CHECK-NEXT: vmov.8 q7[12], r1
1039 ; CHECK-NEXT: ldrb r1, [r3]
1040 ; CHECK-NEXT: vmov.8 q7[13], r1
1041 ; CHECK-NEXT: vmov r1, r3, d1
1042 ; CHECK-NEXT: vadd.i32 q0, q1, r0
1043 ; CHECK-NEXT: vadd.i32 q1, q1, q2
1044 ; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill
1045 ; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload
1046 ; CHECK-NEXT: vadd.i32 q1, q1, q2
1047 ; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill
1048 ; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
1049 ; CHECK-NEXT: vadd.i32 q1, q1, q2
1050 ; CHECK-NEXT: ldrb r1, [r1]
1051 ; CHECK-NEXT: vmov.8 q7[14], r1
1052 ; CHECK-NEXT: ldrb r1, [r3]
1053 ; CHECK-NEXT: vmov.8 q7[15], r1
1054 ; CHECK-NEXT: ldrb.w r1, [lr]
1055 ; CHECK-NEXT: vmov.8 q6[15], r1
1056 ; CHECK-NEXT: vmov r1, r3, d0
1057 ; CHECK-NEXT: vadd.i8 q6, q6, q7
1058 ; CHECK-NEXT: ldrb r1, [r1]
1059 ; CHECK-NEXT: ldrb r3, [r3]
1060 ; CHECK-NEXT: vmov.8 q7[0], r1
1061 ; CHECK-NEXT: vmov.8 q7[1], r3
1062 ; CHECK-NEXT: vmov r1, r3, d1
1063 ; CHECK-NEXT: vadd.i32 q0, q3, r0
1064 ; CHECK-NEXT: vadd.i32 q3, q3, q2
1065 ; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
1066 ; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload
1067 ; CHECK-NEXT: vadd.i32 q3, q3, q2
1068 ; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
1069 ; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload
1070 ; CHECK-NEXT: vadd.i32 q3, q3, q2
1071 ; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill
1072 ; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload
1073 ; CHECK-NEXT: vadd.i32 q3, q3, q2
1074 ; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill
1075 ; CHECK-NEXT: ldrb r1, [r1]
1076 ; CHECK-NEXT: vmov.8 q7[2], r1
1077 ; CHECK-NEXT: ldrb r1, [r3]
1078 ; CHECK-NEXT: vmov.8 q7[3], r1
1079 ; CHECK-NEXT: vmov r1, r3, d0
1080 ; CHECK-NEXT: ldrb r1, [r1]
1081 ; CHECK-NEXT: vmov.8 q7[4], r1
1082 ; CHECK-NEXT: ldrb r1, [r3]
1083 ; CHECK-NEXT: vmov.8 q7[5], r1
1084 ; CHECK-NEXT: vmov r1, r3, d1
1085 ; CHECK-NEXT: vadd.i32 q0, q5, r0
1086 ; CHECK-NEXT: vadd.i32 q5, q5, q2
1087 ; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
1088 ; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload
1089 ; CHECK-NEXT: vadd.i32 q5, q5, q2
1090 ; CHECK-NEXT: ldrb r1, [r1]
1091 ; CHECK-NEXT: vmov.8 q7[6], r1
1092 ; CHECK-NEXT: ldrb r1, [r3]
1093 ; CHECK-NEXT: vmov.8 q7[7], r1
1094 ; CHECK-NEXT: vmov r1, r3, d0
1095 ; CHECK-NEXT: ldrb r1, [r1]
1096 ; CHECK-NEXT: vmov.8 q7[8], r1
1097 ; CHECK-NEXT: ldrb r1, [r3]
1098 ; CHECK-NEXT: vmov.8 q7[9], r1
1099 ; CHECK-NEXT: vmov r1, r3, d1
1100 ; CHECK-NEXT: vadd.i32 q0, q4, r0
1101 ; CHECK-NEXT: vadd.i32 q4, q4, q2
1102 ; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill
1103 ; CHECK-NEXT: ldrb r1, [r1]
1104 ; CHECK-NEXT: vmov.8 q7[10], r1
1105 ; CHECK-NEXT: ldrb r1, [r3]
1106 ; CHECK-NEXT: vmov.8 q7[11], r1
1107 ; CHECK-NEXT: vmov r1, r3, d0
1108 ; CHECK-NEXT: ldrb r1, [r1]
1109 ; CHECK-NEXT: vmov.8 q7[12], r1
1110 ; CHECK-NEXT: ldrb r1, [r3]
1111 ; CHECK-NEXT: vmov.8 q7[13], r1
1112 ; CHECK-NEXT: vmov r1, r3, d1
1113 ; CHECK-NEXT: ldrb r1, [r1]
1114 ; CHECK-NEXT: vmov.8 q7[14], r1
1115 ; CHECK-NEXT: ldrb r1, [r3]
1116 ; CHECK-NEXT: vmov.8 q7[15], r1
1117 ; CHECK-NEXT: vadd.i8 q0, q6, q7
1118 ; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload
1119 ; CHECK-NEXT: vstrb.8 q0, [r8], #16
1120 ; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload
1121 ; CHECK-NEXT: vadd.i32 q7, q7, q2
1122 ; CHECK-NEXT: vadd.i32 q0, q0, q2
1123 ; CHECK-NEXT: bne.w .LBB13_3
1124 ; CHECK-NEXT: @ %bb.4: @ %middle.block
1125 ; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1
1126 ; CHECK-NEXT: cmp r10, r2
1127 ; CHECK-NEXT: bne.w .LBB13_2
1128 ; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup
1129 ; CHECK-NEXT: add sp, #312
1130 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1131 ; CHECK-NEXT: add sp, #4
1132 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
1133 ; CHECK-NEXT: .p2align 4
1134 ; CHECK-NEXT: @ %bb.6:
1135 ; CHECK-NEXT: .LCPI13_0:
1136 ; CHECK-NEXT: .long 38 @ 0x26
1137 ; CHECK-NEXT: .long 41 @ 0x29
1138 ; CHECK-NEXT: .long 44 @ 0x2c
1139 ; CHECK-NEXT: .long 47 @ 0x2f
1140 ; CHECK-NEXT: .LCPI13_1:
1141 ; CHECK-NEXT: .long 14 @ 0xe
1142 ; CHECK-NEXT: .long 17 @ 0x11
1143 ; CHECK-NEXT: .long 20 @ 0x14
1144 ; CHECK-NEXT: .long 23 @ 0x17
1145 ; CHECK-NEXT: .LCPI13_2:
1146 ; CHECK-NEXT: .long 24 @ 0x18
1147 ; CHECK-NEXT: .long 27 @ 0x1b
1148 ; CHECK-NEXT: .long 30 @ 0x1e
1149 ; CHECK-NEXT: .long 33 @ 0x21
1150 ; CHECK-NEXT: .LCPI13_3:
1151 ; CHECK-NEXT: .long 1 @ 0x1
1152 ; CHECK-NEXT: .long 4 @ 0x4
1153 ; CHECK-NEXT: .long 7 @ 0x7
1154 ; CHECK-NEXT: .long 10 @ 0xa
1155 ; CHECK-NEXT: .LCPI13_4:
1156 ; CHECK-NEXT: .long 36 @ 0x24
1157 ; CHECK-NEXT: .long 39 @ 0x27
1158 ; CHECK-NEXT: .long 42 @ 0x2a
1159 ; CHECK-NEXT: .long 45 @ 0x2d
1160 ; CHECK-NEXT: .LCPI13_5:
1161 ; CHECK-NEXT: .long 25 @ 0x19
1162 ; CHECK-NEXT: .long 28 @ 0x1c
1163 ; CHECK-NEXT: .long 31 @ 0x1f
1164 ; CHECK-NEXT: .long 34 @ 0x22
1165 ; CHECK-NEXT: .LCPI13_6:
1166 ; CHECK-NEXT: .long 13 @ 0xd
1167 ; CHECK-NEXT: .long 16 @ 0x10
1168 ; CHECK-NEXT: .long 19 @ 0x13
1169 ; CHECK-NEXT: .long 22 @ 0x16
1170 ; CHECK-NEXT: .LCPI13_7:
1171 ; CHECK-NEXT: .long 2 @ 0x2
1172 ; CHECK-NEXT: .long 5 @ 0x5
1173 ; CHECK-NEXT: .long 8 @ 0x8
1174 ; CHECK-NEXT: .long 11 @ 0xb
1175 ; CHECK-NEXT: .LCPI13_8:
1176 ; CHECK-NEXT: .long 26 @ 0x1a
1177 ; CHECK-NEXT: .long 29 @ 0x1d
1178 ; CHECK-NEXT: .long 32 @ 0x20
1179 ; CHECK-NEXT: .long 35 @ 0x23
1180 ; CHECK-NEXT: .LCPI13_9:
1181 ; CHECK-NEXT: .long 37 @ 0x25
1182 ; CHECK-NEXT: .long 40 @ 0x28
1183 ; CHECK-NEXT: .long 43 @ 0x2b
1184 ; CHECK-NEXT: .long 46 @ 0x2e
1185 ; CHECK-NEXT: .LCPI13_10:
1186 ; CHECK-NEXT: .long 12 @ 0xc
1187 ; CHECK-NEXT: .long 15 @ 0xf
1188 ; CHECK-NEXT: .long 18 @ 0x12
1189 ; CHECK-NEXT: .long 21 @ 0x15
1190 ; CHECK-NEXT: .LCPI13_11:
1191 ; CHECK-NEXT: .long 0 @ 0x0
1192 ; CHECK-NEXT: .long 3 @ 0x3
1193 ; CHECK-NEXT: .long 6 @ 0x6
1194 ; CHECK-NEXT: .long 9 @ 0x9
1198 %cmp22 = icmp sgt i32 %n, 0
1199 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
1201 vector.ph: ; preds = %for.body.preheader
1202 %n.vec = and i32 %n, -8
1203 br label %vector.body
1205 vector.body: ; preds = %vector.body, %vector.ph
1206 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1207 %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
1208 %0 = mul nuw nsw <16 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1209 %1 = getelementptr inbounds i8, ptr %data, <16 x i32> %0
1210 %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
1211 %2 = add nuw nsw <16 x i32> %0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1212 %3 = getelementptr inbounds i8, ptr %data, <16 x i32> %2
1213 %wide.masked.gather24 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %3, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
1214 %4 = add nuw nsw <16 x i32> %0, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
1215 %5 = getelementptr inbounds i8, ptr %data, <16 x i32> %4
1216 %wide.masked.gather25 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %5, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
1217 %6 = add nsw <16 x i8> %wide.masked.gather24, %wide.masked.gather
1218 %7 = add nsw <16 x i8> %6, %wide.masked.gather25
1219 %8 = getelementptr inbounds i8, ptr %dst, i32 %index
1220 store <16 x i8> %7, ptr %8, align 2
1221 %index.next = add i32 %index, 16
1222 %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1223 %9 = icmp eq i32 %index.next, %n.vec
1224 br i1 %9, label %middle.block, label %vector.body
1226 middle.block: ; preds = %vector.body
1227 %cmp.n = icmp eq i32 %n.vec, %n
1228 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
1230 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1234 define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
1235 ; CHECK-LABEL: gather_inc_v16i8_simple:
1236 ; CHECK: @ %bb.0: @ %entry
1237 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1238 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1239 ; CHECK-NEXT: .pad #4
1240 ; CHECK-NEXT: sub sp, #4
1241 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1242 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1243 ; CHECK-NEXT: .pad #64
1244 ; CHECK-NEXT: sub sp, #64
1245 ; CHECK-NEXT: cmp r2, #1
1246 ; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill
1247 ; CHECK-NEXT: mov r1, r2
1248 ; CHECK-NEXT: str r2, [sp, #60] @ 4-byte Spill
1249 ; CHECK-NEXT: blt.w .LBB14_5
1250 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
1251 ; CHECK-NEXT: adr r5, .LCPI14_3
1252 ; CHECK-NEXT: adr r7, .LCPI14_1
1253 ; CHECK-NEXT: vldrw.u32 q0, [r5]
1254 ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
1255 ; CHECK-NEXT: adr r3, .LCPI14_0
1256 ; CHECK-NEXT: adr r6, .LCPI14_2
1257 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
1258 ; CHECK-NEXT: vldrw.u32 q0, [r7]
1259 ; CHECK-NEXT: bic r9, r1, #7
1260 ; CHECK-NEXT: vldrw.u32 q3, [r3]
1261 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
1262 ; CHECK-NEXT: vldrw.u32 q0, [r6]
1263 ; CHECK-NEXT: mov.w lr, #16
; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB14_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_3 Depth 2
; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload
; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: .LBB14_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vadd.i32 q1, q5, r0
; CHECK-NEXT: vadd.i32 q2, q4, r0
; CHECK-NEXT: vmov r7, r3, d3
; CHECK-NEXT: vadd.i32 q6, q0, lr
; CHECK-NEXT: vmov r5, r6, d5
; CHECK-NEXT: subs.w r9, r9, #16
; CHECK-NEXT: vmov r4, r10, d2
; CHECK-NEXT: vadd.i32 q1, q7, lr
; CHECK-NEXT: vadd.i32 q4, q4, lr
; CHECK-NEXT: vadd.i32 q5, q5, lr
; CHECK-NEXT: ldrb.w r11, [r3]
; CHECK-NEXT: ldrb r3, [r7]
; CHECK-NEXT: vmov r7, r12, d4
; CHECK-NEXT: vadd.i32 q2, q7, r0
; CHECK-NEXT: vadd.i32 q7, q0, r0
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb.w r10, [r10]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: ldrb.w r1, [r12]
; CHECK-NEXT: vmov.8 q0[0], r7
; CHECK-NEXT: vmov.8 q0[1], r1
; CHECK-NEXT: vmov r1, r7, d15
; CHECK-NEXT: vmov.8 q0[2], r5
; CHECK-NEXT: vmov.8 q0[3], r6
; CHECK-NEXT: vmov.8 q0[4], r4
; CHECK-NEXT: vmov r4, r2, d4
; CHECK-NEXT: vmov.8 q0[5], r10
; CHECK-NEXT: vmov.8 q0[6], r3
; CHECK-NEXT: vmov.8 q0[7], r11
; CHECK-NEXT: ldrb r6, [r7]
; CHECK-NEXT: vmov r5, r7, d5
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r3, [r5]
; CHECK-NEXT: ldrb.w r12, [r7]
; CHECK-NEXT: ldrb r5, [r4]
; CHECK-NEXT: vmov r4, r7, d14
; CHECK-NEXT: vmov q7, q1
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[8], r4
; CHECK-NEXT: vmov.8 q0[9], r7
; CHECK-NEXT: vmov.8 q0[10], r1
; CHECK-NEXT: vmov.8 q0[11], r6
; CHECK-NEXT: vmov.8 q0[12], r5
; CHECK-NEXT: vmov.8 q0[13], r2
; CHECK-NEXT: vmov.8 q0[14], r3
; CHECK-NEXT: vmov.8 q0[15], r12
; CHECK-NEXT: vstrb.8 q0, [r8], #16
; CHECK-NEXT: vmov q0, q6
; CHECK-NEXT: bne .LBB14_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1
; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload
; CHECK-NEXT: cmp r9, r1
; CHECK-NEXT: bne .LBB14_2
; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #64
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
; CHECK-NEXT: .LCPI14_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .LCPI14_1:
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .long 11 @ 0xb
; CHECK-NEXT: .LCPI14_2:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .LCPI14_3:
; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 13 @ 0xd
; CHECK-NEXT: .long 14 @ 0xe
; CHECK-NEXT: .long 15 @ 0xf
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -8
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %data, <16 x i32> %vec.ind
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %0, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %1 = getelementptr inbounds i8, ptr %dst, i32 %index
  store <16 x i8> %wide.masked.gather, ptr %1, align 2
  %index.next = add i32 %index, 16
  %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = icmp eq i32 %index.next, %n.vec
  br i1 %2, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  ret void
}

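; @shl gathers y[i << 2] under an active-lane mask. Each lane's byte offset
; advances by 64 per iteration, so the gather should fold into a single
; post-incremented vldrw.u32 [q, #64]! inside a tail-predicated dlstp/letp
; loop, as the CHECK lines below verify.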
define void @shl(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: shl:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB15_1: @ %vector.ph
; CHECK-NEXT: adr r3, .LCPI15_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB15_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [q0, #64]!
; CHECK-NEXT: vstrw.32 q1, [r0], #16
; CHECK-NEXT: letp lr, .LBB15_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI15_0:
; CHECK-NEXT: .long 4294967232 @ 0xffffffc0
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = shl nsw <4 x i32> %vec.ind, <i32 2, i32 2, i32 2, i32 2>
  %1 = getelementptr inbounds i32, ptr %y, <4 x i32> %0
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = getelementptr inbounds i32, ptr %x, i32 %index
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %wide.masked.gather, ptr %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

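; @shlor performs four masked gathers per iteration, at i32 offsets 8*i,
; 8*i+2, 8*i+4 and 8*i+6, and stores their sum. Each gather should become
; its own post-incremented vldrw.u32 with a 128-byte step, all inside one
; tail-predicated loop.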
define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: shlor:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: blt .LBB16_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: adr.w lr, .LCPI16_0
; CHECK-NEXT: adr r4, .LCPI16_1
; CHECK-NEXT: adr r5, .LCPI16_2
; CHECK-NEXT: adr r6, .LCPI16_3
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: vldrw.u32 q3, [lr]
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: vadd.i32 q1, q1, r1
; CHECK-NEXT: vadd.i32 q2, q2, r1
; CHECK-NEXT: vadd.i32 q3, q3, r1
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB16_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q4, [q3, #128]!
; CHECK-NEXT: vldrw.u32 q5, [q2, #128]!
; CHECK-NEXT: vldrw.u32 q6, [q0, #128]!
; CHECK-NEXT: vadd.i32 q4, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [q1, #128]!
; CHECK-NEXT: vadd.i32 q4, q4, q5
; CHECK-NEXT: vadd.i32 q4, q4, q6
; CHECK-NEXT: vstrw.32 q4, [r0], #16
; CHECK-NEXT: letp lr, .LBB16_2
; CHECK-NEXT: .LBB16_3: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI16_0:
; CHECK-NEXT: .long 4294967168 @ 0xffffff80
; CHECK-NEXT: .long 4294967200 @ 0xffffffa0
; CHECK-NEXT: .long 4294967232 @ 0xffffffc0
; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
; CHECK-NEXT: .LCPI16_1:
; CHECK-NEXT: .long 4294967176 @ 0xffffff88
; CHECK-NEXT: .long 4294967208 @ 0xffffffa8
; CHECK-NEXT: .long 4294967240 @ 0xffffffc8
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .LCPI16_2:
; CHECK-NEXT: .long 4294967184 @ 0xffffff90
; CHECK-NEXT: .long 4294967216 @ 0xffffffb0
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
; CHECK-NEXT: .LCPI16_3:
; CHECK-NEXT: .long 4294967192 @ 0xffffff98
; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
; CHECK-NEXT: .long 4294967256 @ 0xffffffd8
; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
entry:
  %cmp23 = icmp sgt i32 %n, 0
  br i1 %cmp23, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = shl nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %1 = getelementptr inbounds i32, ptr %y, <4 x i32> %0
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = or <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
  %3 = getelementptr inbounds i32, ptr %y, <4 x i32> %2
  %wide.masked.gather25 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %4 = add nsw <4 x i32> %wide.masked.gather25, %wide.masked.gather
  %5 = or <4 x i32> %0, <i32 4, i32 4, i32 4, i32 4>
  %6 = getelementptr inbounds i32, ptr %y, <4 x i32> %5
  %wide.masked.gather26 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %6, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %4, %wide.masked.gather26
  %8 = or <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
  %9 = getelementptr inbounds i32, ptr %y, <4 x i32> %8
  %wide.masked.gather27 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %9, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %10 = add nsw <4 x i32> %7, %wide.masked.gather27
  %11 = getelementptr inbounds i32, ptr %x, i32 %index
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %10, ptr %11, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %12 = icmp eq i32 %index.next, %n.vec
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

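; Declarations for the masked gather, masked store and active-lane-mask
; intrinsics used by the tests in this file.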
declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
declare <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i16>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x half>)
declare <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x half>)
declare <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x half>)
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)