1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s --verify-machineinstrs -o - | FileCheck %s
; sink_shl_i32: each loop iteration loads a <4 x i32> from %in, shifts it left
; by a splat of the scalar %shift, and stores the result to %out. The splat is
; created ahead of vector.body, and the expected assembly shifts by the scalar
; register directly (`vshl.u32 q0, r2`) inside the loop.
4 define dso_local arm_aapcs_vfpcc void @sink_shl_i32(i32* nocapture readonly %in, i32* noalias nocapture %out, i32 %shift, i32 %N) {
5 ; CHECK-LABEL: sink_shl_i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r7, lr}
8 ; CHECK-NEXT: push {r7, lr}
9 ; CHECK-NEXT: bic r3, r3, #3
10 ; CHECK-NEXT: sub.w r12, r3, #4
11 ; CHECK-NEXT: movs r3, #1
12 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
13 ; CHECK-NEXT: .LBB0_1: @ %vector.body
14 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
15 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
16 ; CHECK-NEXT: vshl.u32 q0, r2
17 ; CHECK-NEXT: vstrb.8 q0, [r1], #16
18 ; CHECK-NEXT: le lr, .LBB0_1
19 ; CHECK-NEXT: @ %bb.2: @ %exit
20 ; CHECK-NEXT: pop {r7, pc}
; Loop bound: %N with its two low bits cleared.
25 %n.vec = and i32 %N, -4
; Splat %shift: insert it into lane 0, then shuffle with an all-zero mask.
26 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0
27 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
30 vector.body: ; preds = %vector.body, %vector.ph
31 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
32 %gep.in = getelementptr inbounds i32, i32* %in, i32 %index
33 %cast.in = bitcast i32* %gep.in to <4 x i32>*
34 %wide.load = load <4 x i32>, <4 x i32>* %cast.in, align 4
; Operation under test: shl by the splatted scalar.
35 %res = shl <4 x i32> %wide.load, %broadcast.splat11
36 %gep.out = getelementptr inbounds i32, i32* %out, i32 %index
37 %cast.out = bitcast i32* %gep.out to <4 x i32>*
38 store <4 x i32> %res, <4 x i32>* %cast.out, align 4
; Step by 4 elements and exit once %index.next equals %n.vec.
39 %index.next = add i32 %index, 4
40 %cmp = icmp eq i32 %index.next, %n.vec
41 br i1 %cmp, label %exit, label %vector.body
; sink_shl_i16: each loop iteration loads an <8 x i16> from %in, shifts it left
; by a splat of the scalar %shift, and stores the result to %out. The splat is
; created ahead of vector.body, and the expected assembly shifts by the scalar
; register directly (`vshl.u16 q0, r2`) inside the loop.
; NOTE(review): %index advances by 4 i16 elements per iteration although each
; access covers 8 (the expected post-increment is #8 bytes), so consecutive
; iterations overlap. Presumably irrelevant to what this test exercises;
; confirm before changing, since the expected assembly encodes that stride.
47 define dso_local arm_aapcs_vfpcc void @sink_shl_i16(i16* nocapture readonly %in, i16* noalias nocapture %out, i16 %shift, i32 %N) {
48 ; CHECK-LABEL: sink_shl_i16:
49 ; CHECK: @ %bb.0: @ %entry
50 ; CHECK-NEXT: .save {r7, lr}
51 ; CHECK-NEXT: push {r7, lr}
52 ; CHECK-NEXT: bic r3, r3, #3
53 ; CHECK-NEXT: sub.w r12, r3, #4
54 ; CHECK-NEXT: movs r3, #1
55 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
56 ; CHECK-NEXT: .LBB1_1: @ %vector.body
57 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
58 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8
59 ; CHECK-NEXT: vshl.u16 q0, r2
60 ; CHECK-NEXT: vstrb.8 q0, [r1], #8
61 ; CHECK-NEXT: le lr, .LBB1_1
62 ; CHECK-NEXT: @ %bb.2: @ %exit
63 ; CHECK-NEXT: pop {r7, pc}
; Loop bound: %N with its two low bits cleared.
68 %n.vec = and i32 %N, -4
; Splat %shift: insert it into lane 0, then shuffle with an all-zero mask.
69 %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0
70 %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer
73 vector.body: ; preds = %vector.body, %vector.ph
74 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
75 %gep.in = getelementptr inbounds i16, i16* %in, i32 %index
76 %cast.in = bitcast i16* %gep.in to <8 x i16>*
77 %wide.load = load <8 x i16>, <8 x i16>* %cast.in, align 4
; Operation under test: shl by the splatted scalar.
78 %res = shl <8 x i16> %wide.load, %broadcast.splat11
79 %gep.out = getelementptr inbounds i16, i16* %out, i32 %index
80 %cast.out = bitcast i16* %gep.out to <8 x i16>*
81 store <8 x i16> %res, <8 x i16>* %cast.out, align 4
; Step by 4 elements and exit once %index.next equals %n.vec.
82 %index.next = add i32 %index, 4
83 %cmp = icmp eq i32 %index.next, %n.vec
84 br i1 %cmp, label %exit, label %vector.body
; sink_shl_i8: each loop iteration loads a <16 x i8> from %in, shifts it left
; by a splat of the scalar %shift, and stores the result to %out. The splat is
; created ahead of vector.body, and the expected assembly shifts by the scalar
; register directly (`vshl.u8 q0, r2`) inside the loop.
; NOTE(review): %index advances by 4 i8 elements per iteration although each
; access covers 16 (the expected post-increment is #4 bytes), so consecutive
; iterations overlap. Presumably irrelevant to what this test exercises;
; confirm before changing, since the expected assembly encodes that stride.
90 define dso_local arm_aapcs_vfpcc void @sink_shl_i8(i8* nocapture readonly %in, i8* noalias nocapture %out, i8 %shift, i32 %N) {
91 ; CHECK-LABEL: sink_shl_i8:
92 ; CHECK: @ %bb.0: @ %entry
93 ; CHECK-NEXT: .save {r7, lr}
94 ; CHECK-NEXT: push {r7, lr}
95 ; CHECK-NEXT: bic r3, r3, #3
96 ; CHECK-NEXT: sub.w r12, r3, #4
97 ; CHECK-NEXT: movs r3, #1
98 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
99 ; CHECK-NEXT: .LBB2_1: @ %vector.body
100 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
101 ; CHECK-NEXT: vldrw.u32 q0, [r0], #4
102 ; CHECK-NEXT: vshl.u8 q0, r2
103 ; CHECK-NEXT: vstrb.8 q0, [r1], #4
104 ; CHECK-NEXT: le lr, .LBB2_1
105 ; CHECK-NEXT: @ %bb.2: @ %exit
106 ; CHECK-NEXT: pop {r7, pc}
; Loop bound: %N with its two low bits cleared.
111 %n.vec = and i32 %N, -4
; Splat %shift: insert it into lane 0, then shuffle with an all-zero mask.
112 %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0
113 %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer
114 br label %vector.body
116 vector.body: ; preds = %vector.body, %vector.ph
117 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
118 %gep.in = getelementptr inbounds i8, i8* %in, i32 %index
119 %cast.in = bitcast i8* %gep.in to <16 x i8>*
120 %wide.load = load <16 x i8>, <16 x i8>* %cast.in, align 4
; Operation under test: shl by the splatted scalar.
121 %res = shl <16 x i8> %wide.load, %broadcast.splat11
122 %gep.out = getelementptr inbounds i8, i8* %out, i32 %index
123 %cast.out = bitcast i8* %gep.out to <16 x i8>*
124 store <16 x i8> %res, <16 x i8>* %cast.out, align 4
; Step by 4 elements and exit once %index.next equals %n.vec.
125 %index.next = add i32 %index, 4
126 %cmp = icmp eq i32 %index.next, %n.vec
127 br i1 %cmp, label %exit, label %vector.body
; sink_lshr_i32: each loop iteration loads a <4 x i32> from %in, logically
; shifts it right by a splat of the scalar %shift, and stores the result to
; %out. The expected assembly negates r2 once ahead of the loop
; (`rsbs r2, r2, #0`) and then uses the unsigned register-form shift
; `vshl.u32 q0, r2` inside it -- presumably relying on VSHL treating a negative
; register amount as a right shift (confirm against the Armv8.1-M reference).
133 define dso_local arm_aapcs_vfpcc void @sink_lshr_i32(i32* nocapture readonly %in, i32* noalias nocapture %out, i32 %shift, i32 %N) {
134 ; CHECK-LABEL: sink_lshr_i32:
135 ; CHECK: @ %bb.0: @ %entry
136 ; CHECK-NEXT: .save {r7, lr}
137 ; CHECK-NEXT: push {r7, lr}
138 ; CHECK-NEXT: bic r3, r3, #3
139 ; CHECK-NEXT: rsbs r2, r2, #0
140 ; CHECK-NEXT: sub.w r12, r3, #4
141 ; CHECK-NEXT: movs r3, #1
142 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
143 ; CHECK-NEXT: .LBB3_1: @ %vector.body
144 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
145 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
146 ; CHECK-NEXT: vshl.u32 q0, r2
147 ; CHECK-NEXT: vstrb.8 q0, [r1], #16
148 ; CHECK-NEXT: le lr, .LBB3_1
149 ; CHECK-NEXT: @ %bb.2: @ %exit
150 ; CHECK-NEXT: pop {r7, pc}
; Loop bound: %N with its two low bits cleared.
155 %n.vec = and i32 %N, -4
; Splat %shift: insert it into lane 0, then shuffle with an all-zero mask.
156 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0
157 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
158 br label %vector.body
160 vector.body: ; preds = %vector.body, %vector.ph
161 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
162 %gep.in = getelementptr inbounds i32, i32* %in, i32 %index
163 %cast.in = bitcast i32* %gep.in to <4 x i32>*
164 %wide.load = load <4 x i32>, <4 x i32>* %cast.in, align 4
; Operation under test: lshr by the splatted scalar.
165 %res = lshr <4 x i32> %wide.load, %broadcast.splat11
166 %gep.out = getelementptr inbounds i32, i32* %out, i32 %index
167 %cast.out = bitcast i32* %gep.out to <4 x i32>*
168 store <4 x i32> %res, <4 x i32>* %cast.out, align 4
; Step by 4 elements and exit once %index.next equals %n.vec.
169 %index.next = add i32 %index, 4
170 %cmp = icmp eq i32 %index.next, %n.vec
171 br i1 %cmp, label %exit, label %vector.body
; sink_lshr_i16: each loop iteration loads an <8 x i16> from %in, logically
; shifts it right by a splat of the scalar %shift, and stores the result to
; %out. The expected assembly negates r2 once ahead of the loop
; (`rsbs r2, r2, #0`) and then uses the unsigned register-form shift
; `vshl.u16 q0, r2` inside it -- presumably relying on VSHL treating a negative
; register amount as a right shift (confirm against the Armv8.1-M reference).
; NOTE(review): %index advances by 4 i16 elements per iteration although each
; access covers 8 (the expected post-increment is #8 bytes), so consecutive
; iterations overlap. Presumably irrelevant to what this test exercises;
; confirm before changing, since the expected assembly encodes that stride.
177 define dso_local arm_aapcs_vfpcc void @sink_lshr_i16(i16* nocapture readonly %in, i16* noalias nocapture %out, i16 %shift, i32 %N) {
178 ; CHECK-LABEL: sink_lshr_i16:
179 ; CHECK: @ %bb.0: @ %entry
180 ; CHECK-NEXT: .save {r7, lr}
181 ; CHECK-NEXT: push {r7, lr}
182 ; CHECK-NEXT: bic r3, r3, #3
183 ; CHECK-NEXT: rsbs r2, r2, #0
184 ; CHECK-NEXT: sub.w r12, r3, #4
185 ; CHECK-NEXT: movs r3, #1
186 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
187 ; CHECK-NEXT: .LBB4_1: @ %vector.body
188 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
189 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8
190 ; CHECK-NEXT: vshl.u16 q0, r2
191 ; CHECK-NEXT: vstrb.8 q0, [r1], #8
192 ; CHECK-NEXT: le lr, .LBB4_1
193 ; CHECK-NEXT: @ %bb.2: @ %exit
194 ; CHECK-NEXT: pop {r7, pc}
; Loop bound: %N with its two low bits cleared.
199 %n.vec = and i32 %N, -4
; Splat %shift: insert it into lane 0, then shuffle with an all-zero mask.
200 %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0
201 %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer
202 br label %vector.body
204 vector.body: ; preds = %vector.body, %vector.ph
205 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
206 %gep.in = getelementptr inbounds i16, i16* %in, i32 %index
207 %cast.in = bitcast i16* %gep.in to <8 x i16>*
208 %wide.load = load <8 x i16>, <8 x i16>* %cast.in, align 4
; Operation under test: lshr by the splatted scalar.
209 %res = lshr <8 x i16> %wide.load, %broadcast.splat11
210 %gep.out = getelementptr inbounds i16, i16* %out, i32 %index
211 %cast.out = bitcast i16* %gep.out to <8 x i16>*
212 store <8 x i16> %res, <8 x i16>* %cast.out, align 4
; Step by 4 elements and exit once %index.next equals %n.vec.
213 %index.next = add i32 %index, 4
214 %cmp = icmp eq i32 %index.next, %n.vec
215 br i1 %cmp, label %exit, label %vector.body
; sink_lshr_i8: each loop iteration loads a <16 x i8> from %in, logically
; shifts it right by a splat of the scalar %shift, and stores the result to
; %out. The expected assembly negates r2 once ahead of the loop
; (`rsbs r2, r2, #0`) and then uses the unsigned register-form shift
; `vshl.u8 q0, r2` inside it -- presumably relying on VSHL treating a negative
; register amount as a right shift (confirm against the Armv8.1-M reference).
; NOTE(review): %index advances by 4 i8 elements per iteration although each
; access covers 16 (the expected post-increment is #4 bytes), so consecutive
; iterations overlap. Presumably irrelevant to what this test exercises;
; confirm before changing, since the expected assembly encodes that stride.
221 define dso_local arm_aapcs_vfpcc void @sink_lshr_i8(i8* nocapture readonly %in, i8* noalias nocapture %out, i8 %shift, i32 %N) {
222 ; CHECK-LABEL: sink_lshr_i8:
223 ; CHECK: @ %bb.0: @ %entry
224 ; CHECK-NEXT: .save {r7, lr}
225 ; CHECK-NEXT: push {r7, lr}
226 ; CHECK-NEXT: bic r3, r3, #3
227 ; CHECK-NEXT: rsbs r2, r2, #0
228 ; CHECK-NEXT: sub.w r12, r3, #4
229 ; CHECK-NEXT: movs r3, #1
230 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
231 ; CHECK-NEXT: .LBB5_1: @ %vector.body
232 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
233 ; CHECK-NEXT: vldrw.u32 q0, [r0], #4
234 ; CHECK-NEXT: vshl.u8 q0, r2
235 ; CHECK-NEXT: vstrb.8 q0, [r1], #4
236 ; CHECK-NEXT: le lr, .LBB5_1
237 ; CHECK-NEXT: @ %bb.2: @ %exit
238 ; CHECK-NEXT: pop {r7, pc}
; Loop bound: %N with its two low bits cleared.
243 %n.vec = and i32 %N, -4
; Splat %shift: insert it into lane 0, then shuffle with an all-zero mask.
244 %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0
245 %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer
246 br label %vector.body
248 vector.body: ; preds = %vector.body, %vector.ph
249 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
250 %gep.in = getelementptr inbounds i8, i8* %in, i32 %index
251 %cast.in = bitcast i8* %gep.in to <16 x i8>*
252 %wide.load = load <16 x i8>, <16 x i8>* %cast.in, align 4
; Operation under test: lshr by the splatted scalar.
253 %res = lshr <16 x i8> %wide.load, %broadcast.splat11
254 %gep.out = getelementptr inbounds i8, i8* %out, i32 %index
255 %cast.out = bitcast i8* %gep.out to <16 x i8>*
256 store <16 x i8> %res, <16 x i8>* %cast.out, align 4
; Step by 4 elements and exit once %index.next equals %n.vec.
257 %index.next = add i32 %index, 4
258 %cmp = icmp eq i32 %index.next, %n.vec
259 br i1 %cmp, label %exit, label %vector.body
; sink_ashr_i32: each loop iteration loads a <4 x i32> from %in, arithmetically
; shifts it right by a splat of the scalar %shift, and stores the result to
; %out. The expected assembly negates r2 once ahead of the loop
; (`rsbs r2, r2, #0`) and then uses the signed register-form shift
; `vshl.s32 q0, r2` inside it -- presumably relying on VSHL treating a negative
; register amount as a right shift (confirm against the Armv8.1-M reference).
265 define dso_local arm_aapcs_vfpcc void @sink_ashr_i32(i32* nocapture readonly %in, i32* noalias nocapture %out, i32 %shift, i32 %N) {
266 ; CHECK-LABEL: sink_ashr_i32:
267 ; CHECK: @ %bb.0: @ %entry
268 ; CHECK-NEXT: .save {r7, lr}
269 ; CHECK-NEXT: push {r7, lr}
270 ; CHECK-NEXT: bic r3, r3, #3
271 ; CHECK-NEXT: rsbs r2, r2, #0
272 ; CHECK-NEXT: sub.w r12, r3, #4
273 ; CHECK-NEXT: movs r3, #1
274 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
275 ; CHECK-NEXT: .LBB6_1: @ %vector.body
276 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
277 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
278 ; CHECK-NEXT: vshl.s32 q0, r2
279 ; CHECK-NEXT: vstrb.8 q0, [r1], #16
280 ; CHECK-NEXT: le lr, .LBB6_1
281 ; CHECK-NEXT: @ %bb.2: @ %exit
282 ; CHECK-NEXT: pop {r7, pc}
; Loop bound: %N with its two low bits cleared.
287 %n.vec = and i32 %N, -4
; Splat %shift: insert it into lane 0, then shuffle with an all-zero mask.
288 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0
289 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
290 br label %vector.body
292 vector.body: ; preds = %vector.body, %vector.ph
293 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
294 %gep.in = getelementptr inbounds i32, i32* %in, i32 %index
295 %cast.in = bitcast i32* %gep.in to <4 x i32>*
296 %wide.load = load <4 x i32>, <4 x i32>* %cast.in, align 4
; Operation under test: ashr by the splatted scalar.
297 %res = ashr <4 x i32> %wide.load, %broadcast.splat11
298 %gep.out = getelementptr inbounds i32, i32* %out, i32 %index
299 %cast.out = bitcast i32* %gep.out to <4 x i32>*
300 store <4 x i32> %res, <4 x i32>* %cast.out, align 4
; Step by 4 elements and exit once %index.next equals %n.vec.
301 %index.next = add i32 %index, 4
302 %cmp = icmp eq i32 %index.next, %n.vec
303 br i1 %cmp, label %exit, label %vector.body
; sink_ashr_i16: each loop iteration loads an <8 x i16> from %in,
; arithmetically shifts it right by a splat of the scalar %shift, and stores
; the result to %out. The expected assembly negates r2 once ahead of the loop
; (`rsbs r2, r2, #0`) and then uses the signed register-form shift
; `vshl.s16 q0, r2` inside it -- presumably relying on VSHL treating a negative
; register amount as a right shift (confirm against the Armv8.1-M reference).
; NOTE(review): %index advances by 4 i16 elements per iteration although each
; access covers 8 (the expected post-increment is #8 bytes), so consecutive
; iterations overlap. Presumably irrelevant to what this test exercises;
; confirm before changing, since the expected assembly encodes that stride.
309 define dso_local arm_aapcs_vfpcc void @sink_ashr_i16(i16* nocapture readonly %in, i16* noalias nocapture %out, i16 %shift, i32 %N) {
310 ; CHECK-LABEL: sink_ashr_i16:
311 ; CHECK: @ %bb.0: @ %entry
312 ; CHECK-NEXT: .save {r7, lr}
313 ; CHECK-NEXT: push {r7, lr}
314 ; CHECK-NEXT: bic r3, r3, #3
315 ; CHECK-NEXT: rsbs r2, r2, #0
316 ; CHECK-NEXT: sub.w r12, r3, #4
317 ; CHECK-NEXT: movs r3, #1
318 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
319 ; CHECK-NEXT: .LBB7_1: @ %vector.body
320 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
321 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8
322 ; CHECK-NEXT: vshl.s16 q0, r2
323 ; CHECK-NEXT: vstrb.8 q0, [r1], #8
324 ; CHECK-NEXT: le lr, .LBB7_1
325 ; CHECK-NEXT: @ %bb.2: @ %exit
326 ; CHECK-NEXT: pop {r7, pc}
; Loop bound: %N with its two low bits cleared.
331 %n.vec = and i32 %N, -4
; Splat %shift: insert it into lane 0, then shuffle with an all-zero mask.
332 %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0
333 %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer
334 br label %vector.body
336 vector.body: ; preds = %vector.body, %vector.ph
337 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
338 %gep.in = getelementptr inbounds i16, i16* %in, i32 %index
339 %cast.in = bitcast i16* %gep.in to <8 x i16>*
340 %wide.load = load <8 x i16>, <8 x i16>* %cast.in, align 4
; Operation under test: ashr by the splatted scalar.
341 %res = ashr <8 x i16> %wide.load, %broadcast.splat11
342 %gep.out = getelementptr inbounds i16, i16* %out, i32 %index
343 %cast.out = bitcast i16* %gep.out to <8 x i16>*
344 store <8 x i16> %res, <8 x i16>* %cast.out, align 4
; Step by 4 elements and exit once %index.next equals %n.vec.
345 %index.next = add i32 %index, 4
346 %cmp = icmp eq i32 %index.next, %n.vec
347 br i1 %cmp, label %exit, label %vector.body
; sink_ashr_i8: each loop iteration loads a <16 x i8> from %in, arithmetically
; shifts it right by a splat of the scalar %shift, and stores the result to
; %out. The expected assembly negates r2 once ahead of the loop
; (`rsbs r2, r2, #0`) and then uses the signed register-form shift
; `vshl.s8 q0, r2` inside it -- presumably relying on VSHL treating a negative
; register amount as a right shift (confirm against the Armv8.1-M reference).
; NOTE(review): %index advances by 4 i8 elements per iteration although each
; access covers 16 (the expected post-increment is #4 bytes), so consecutive
; iterations overlap. Presumably irrelevant to what this test exercises;
; confirm before changing, since the expected assembly encodes that stride.
353 define dso_local arm_aapcs_vfpcc void @sink_ashr_i8(i8* nocapture readonly %in, i8* noalias nocapture %out, i8 %shift, i32 %N) {
354 ; CHECK-LABEL: sink_ashr_i8:
355 ; CHECK: @ %bb.0: @ %entry
356 ; CHECK-NEXT: .save {r7, lr}
357 ; CHECK-NEXT: push {r7, lr}
358 ; CHECK-NEXT: bic r3, r3, #3
359 ; CHECK-NEXT: rsbs r2, r2, #0
360 ; CHECK-NEXT: sub.w r12, r3, #4
361 ; CHECK-NEXT: movs r3, #1
362 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
363 ; CHECK-NEXT: .LBB8_1: @ %vector.body
364 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
365 ; CHECK-NEXT: vldrw.u32 q0, [r0], #4
366 ; CHECK-NEXT: vshl.s8 q0, r2
367 ; CHECK-NEXT: vstrb.8 q0, [r1], #4
368 ; CHECK-NEXT: le lr, .LBB8_1
369 ; CHECK-NEXT: @ %bb.2: @ %exit
370 ; CHECK-NEXT: pop {r7, pc}
; Loop bound: %N with its two low bits cleared.
375 %n.vec = and i32 %N, -4
; Splat %shift: insert it into lane 0, then shuffle with an all-zero mask.
376 %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0
377 %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer
378 br label %vector.body
380 vector.body: ; preds = %vector.body, %vector.ph
381 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
382 %gep.in = getelementptr inbounds i8, i8* %in, i32 %index
383 %cast.in = bitcast i8* %gep.in to <16 x i8>*
384 %wide.load = load <16 x i8>, <16 x i8>* %cast.in, align 4
; Operation under test: ashr by the splatted scalar.
385 %res = ashr <16 x i8> %wide.load, %broadcast.splat11
386 %gep.out = getelementptr inbounds i8, i8* %out, i32 %index
387 %cast.out = bitcast i8* %gep.out to <16 x i8>*
388 store <16 x i8> %res, <16 x i8>* %cast.out, align 4
; Step by 4 elements and exit once %index.next equals %n.vec.
389 %index.next = add i32 %index, 4
390 %cmp = icmp eq i32 %index.next, %n.vec
391 br i1 %cmp, label %exit, label %vector.body