1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s --verify-machineinstrs -o - | FileCheck %s
; Test: a loop that shifts each <4 x i32> chunk loaded from %in left by a
; splat of the scalar %shift and stores the result to %out. The expected
; assembly below uses the scalar register directly inside the loop
; (vshl.u32 q0, r2) and contains no separate vector-splat instruction.
4 define dso_local arm_aapcs_vfpcc void @sink_shl_i32(ptr nocapture readonly %in, ptr noalias nocapture %out, i32 %shift, i32 %N) {
5 ; CHECK-LABEL: sink_shl_i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r7, lr}
8 ; CHECK-NEXT: push {r7, lr}
9 ; CHECK-NEXT: bic r3, r3, #3
10 ; CHECK-NEXT: sub.w r12, r3, #4
11 ; CHECK-NEXT: movs r3, #1
12 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
13 ; CHECK-NEXT: .LBB0_1: @ %vector.body
14 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
15 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
16 ; CHECK-NEXT: vshl.u32 q0, r2
17 ; CHECK-NEXT: vstrb.8 q0, [r1], #16
18 ; CHECK-NEXT: le lr, .LBB0_1
19 ; CHECK-NEXT: @ %bb.2: @ %exit
20 ; CHECK-NEXT: pop {r7, pc}
; %n.vec is %N with its low two bits cleared (rounded down to a multiple of 4).
25 %n.vec = and i32 %N, -4
; Inserting %shift into lane 0 and shuffling with an all-zero mask broadcasts
; it to every lane of the vector.
26 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0
27 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
; Bottom-tested loop: the body runs before the exit comparison. %index
; advances by 4 i32 elements per iteration and the loop leaves once
; %index.next equals %n.vec.
30 vector.body: ; preds = %vector.body, %vector.ph
31 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
32 %gep.in = getelementptr inbounds i32, ptr %in, i32 %index
33 %wide.load = load <4 x i32>, ptr %gep.in, align 4
; The shift whose splat operand is defined outside the loop.
34 %res = shl <4 x i32> %wide.load, %broadcast.splat11
35 %gep.out = getelementptr inbounds i32, ptr %out, i32 %index
36 store <4 x i32> %res, ptr %gep.out, align 4
37 %index.next = add i32 %index, 4
38 %cmp = icmp eq i32 %index.next, %n.vec
39 br i1 %cmp, label %exit, label %vector.body
; Test: a loop that shifts each <8 x i16> chunk loaded from %in left by a
; splat of the i16 scalar %shift and stores the result to %out. The expected
; assembly below uses the scalar register directly inside the loop
; (vshl.u16 q0, r2) and contains no separate vector-splat instruction.
45 define dso_local arm_aapcs_vfpcc void @sink_shl_i16(ptr nocapture readonly %in, ptr noalias nocapture %out, i16 %shift, i32 %N) {
46 ; CHECK-LABEL: sink_shl_i16:
47 ; CHECK: @ %bb.0: @ %entry
48 ; CHECK-NEXT: .save {r7, lr}
49 ; CHECK-NEXT: push {r7, lr}
50 ; CHECK-NEXT: bic r3, r3, #3
51 ; CHECK-NEXT: sub.w r12, r3, #4
52 ; CHECK-NEXT: movs r3, #1
53 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
54 ; CHECK-NEXT: .LBB1_1: @ %vector.body
55 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
56 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8
57 ; CHECK-NEXT: vshl.u16 q0, r2
58 ; CHECK-NEXT: vstrb.8 q0, [r1], #8
59 ; CHECK-NEXT: le lr, .LBB1_1
60 ; CHECK-NEXT: @ %bb.2: @ %exit
61 ; CHECK-NEXT: pop {r7, pc}
; %n.vec is %N with its low two bits cleared (rounded down to a multiple of 4).
66 %n.vec = and i32 %N, -4
; Inserting %shift into lane 0 and shuffling with an all-zero mask broadcasts
; it to all 8 lanes.
67 %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0
68 %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer
; Bottom-tested loop: the body runs before the exit comparison and the loop
; leaves once %index.next equals %n.vec.
; NOTE(review): %index steps by 4 i16 elements (8 bytes) while each load and
; store is <8 x i16> (16 bytes), so successive iterations touch overlapping
; memory; the #8 post-increment in the expected assembly reflects this.
; Confirm the overlap is intentional for this test.
71 vector.body: ; preds = %vector.body, %vector.ph
72 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
73 %gep.in = getelementptr inbounds i16, ptr %in, i32 %index
74 %wide.load = load <8 x i16>, ptr %gep.in, align 4
; The shift whose splat operand is defined outside the loop.
75 %res = shl <8 x i16> %wide.load, %broadcast.splat11
76 %gep.out = getelementptr inbounds i16, ptr %out, i32 %index
77 store <8 x i16> %res, ptr %gep.out, align 4
78 %index.next = add i32 %index, 4
79 %cmp = icmp eq i32 %index.next, %n.vec
80 br i1 %cmp, label %exit, label %vector.body
; Test: a loop that shifts each <16 x i8> chunk loaded from %in left by a
; splat of the i8 scalar %shift and stores the result to %out. The expected
; assembly below uses the scalar register directly inside the loop
; (vshl.u8 q0, r2) and contains no separate vector-splat instruction.
86 define dso_local arm_aapcs_vfpcc void @sink_shl_i8(ptr nocapture readonly %in, ptr noalias nocapture %out, i8 %shift, i32 %N) {
87 ; CHECK-LABEL: sink_shl_i8:
88 ; CHECK: @ %bb.0: @ %entry
89 ; CHECK-NEXT: .save {r7, lr}
90 ; CHECK-NEXT: push {r7, lr}
91 ; CHECK-NEXT: bic r3, r3, #3
92 ; CHECK-NEXT: sub.w r12, r3, #4
93 ; CHECK-NEXT: movs r3, #1
94 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
95 ; CHECK-NEXT: .LBB2_1: @ %vector.body
96 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
97 ; CHECK-NEXT: vldrw.u32 q0, [r0], #4
98 ; CHECK-NEXT: vshl.u8 q0, r2
99 ; CHECK-NEXT: vstrb.8 q0, [r1], #4
100 ; CHECK-NEXT: le lr, .LBB2_1
101 ; CHECK-NEXT: @ %bb.2: @ %exit
102 ; CHECK-NEXT: pop {r7, pc}
; %n.vec is %N with its low two bits cleared (rounded down to a multiple of 4).
107 %n.vec = and i32 %N, -4
; Inserting %shift into lane 0 and shuffling with an all-zero mask broadcasts
; it to all 16 lanes.
108 %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0
109 %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer
110 br label %vector.body
; Bottom-tested loop: the body runs before the exit comparison and the loop
; leaves once %index.next equals %n.vec.
; NOTE(review): %index steps by 4 bytes while each load and store is
; <16 x i8> (16 bytes), so successive iterations touch overlapping memory;
; the #4 post-increment in the expected assembly reflects this. Confirm the
; overlap is intentional for this test.
112 vector.body: ; preds = %vector.body, %vector.ph
113 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
114 %gep.in = getelementptr inbounds i8, ptr %in, i32 %index
115 %wide.load = load <16 x i8>, ptr %gep.in, align 4
; The shift whose splat operand is defined outside the loop.
116 %res = shl <16 x i8> %wide.load, %broadcast.splat11
117 %gep.out = getelementptr inbounds i8, ptr %out, i32 %index
118 store <16 x i8> %res, ptr %gep.out, align 4
119 %index.next = add i32 %index, 4
120 %cmp = icmp eq i32 %index.next, %n.vec
121 br i1 %cmp, label %exit, label %vector.body
; Test: a loop that logically shifts each <4 x i32> chunk loaded from %in
; right by a splat of the scalar %shift and stores the result to %out. The
; expected assembly below negates the shift amount once before the loop
; (rsbs r2, r2, #0) and then uses the unsigned register-shift form
; (vshl.u32 q0, r2) inside the loop, with no separate vector-splat
; instruction.
127 define dso_local arm_aapcs_vfpcc void @sink_lshr_i32(ptr nocapture readonly %in, ptr noalias nocapture %out, i32 %shift, i32 %N) {
128 ; CHECK-LABEL: sink_lshr_i32:
129 ; CHECK: @ %bb.0: @ %entry
130 ; CHECK-NEXT: .save {r7, lr}
131 ; CHECK-NEXT: push {r7, lr}
132 ; CHECK-NEXT: bic r3, r3, #3
133 ; CHECK-NEXT: rsbs r2, r2, #0
134 ; CHECK-NEXT: sub.w r12, r3, #4
135 ; CHECK-NEXT: movs r3, #1
136 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
137 ; CHECK-NEXT: .LBB3_1: @ %vector.body
138 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
139 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
140 ; CHECK-NEXT: vshl.u32 q0, r2
141 ; CHECK-NEXT: vstrb.8 q0, [r1], #16
142 ; CHECK-NEXT: le lr, .LBB3_1
143 ; CHECK-NEXT: @ %bb.2: @ %exit
144 ; CHECK-NEXT: pop {r7, pc}
; %n.vec is %N with its low two bits cleared (rounded down to a multiple of 4).
149 %n.vec = and i32 %N, -4
; Inserting %shift into lane 0 and shuffling with an all-zero mask broadcasts
; it to every lane of the vector.
150 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0
151 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
152 br label %vector.body
; Bottom-tested loop: the body runs before the exit comparison. %index
; advances by 4 i32 elements per iteration and the loop leaves once
; %index.next equals %n.vec.
154 vector.body: ; preds = %vector.body, %vector.ph
155 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
156 %gep.in = getelementptr inbounds i32, ptr %in, i32 %index
157 %wide.load = load <4 x i32>, ptr %gep.in, align 4
; The logical right shift whose splat operand is defined outside the loop.
158 %res = lshr <4 x i32> %wide.load, %broadcast.splat11
159 %gep.out = getelementptr inbounds i32, ptr %out, i32 %index
160 store <4 x i32> %res, ptr %gep.out, align 4
161 %index.next = add i32 %index, 4
162 %cmp = icmp eq i32 %index.next, %n.vec
163 br i1 %cmp, label %exit, label %vector.body
; Test: a loop that logically shifts each <8 x i16> chunk loaded from %in
; right by a splat of the i16 scalar %shift and stores the result to %out.
; The expected assembly below negates the shift amount once before the loop
; (rsbs r2, r2, #0) and then uses the unsigned register-shift form
; (vshl.u16 q0, r2) inside the loop, with no separate vector-splat
; instruction.
169 define dso_local arm_aapcs_vfpcc void @sink_lshr_i16(ptr nocapture readonly %in, ptr noalias nocapture %out, i16 %shift, i32 %N) {
170 ; CHECK-LABEL: sink_lshr_i16:
171 ; CHECK: @ %bb.0: @ %entry
172 ; CHECK-NEXT: .save {r7, lr}
173 ; CHECK-NEXT: push {r7, lr}
174 ; CHECK-NEXT: bic r3, r3, #3
175 ; CHECK-NEXT: rsbs r2, r2, #0
176 ; CHECK-NEXT: sub.w r12, r3, #4
177 ; CHECK-NEXT: movs r3, #1
178 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
179 ; CHECK-NEXT: .LBB4_1: @ %vector.body
180 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
181 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8
182 ; CHECK-NEXT: vshl.u16 q0, r2
183 ; CHECK-NEXT: vstrb.8 q0, [r1], #8
184 ; CHECK-NEXT: le lr, .LBB4_1
185 ; CHECK-NEXT: @ %bb.2: @ %exit
186 ; CHECK-NEXT: pop {r7, pc}
; %n.vec is %N with its low two bits cleared (rounded down to a multiple of 4).
191 %n.vec = and i32 %N, -4
; Inserting %shift into lane 0 and shuffling with an all-zero mask broadcasts
; it to all 8 lanes.
192 %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0
193 %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer
194 br label %vector.body
; Bottom-tested loop: the body runs before the exit comparison and the loop
; leaves once %index.next equals %n.vec.
; NOTE(review): %index steps by 4 i16 elements (8 bytes) while each load and
; store is <8 x i16> (16 bytes), so successive iterations touch overlapping
; memory; the #8 post-increment in the expected assembly reflects this.
; Confirm the overlap is intentional for this test.
196 vector.body: ; preds = %vector.body, %vector.ph
197 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
198 %gep.in = getelementptr inbounds i16, ptr %in, i32 %index
199 %wide.load = load <8 x i16>, ptr %gep.in, align 4
; The logical right shift whose splat operand is defined outside the loop.
200 %res = lshr <8 x i16> %wide.load, %broadcast.splat11
201 %gep.out = getelementptr inbounds i16, ptr %out, i32 %index
202 store <8 x i16> %res, ptr %gep.out, align 4
203 %index.next = add i32 %index, 4
204 %cmp = icmp eq i32 %index.next, %n.vec
205 br i1 %cmp, label %exit, label %vector.body
; Test: a loop that logically shifts each <16 x i8> chunk loaded from %in
; right by a splat of the i8 scalar %shift and stores the result to %out.
; The expected assembly below negates the shift amount once before the loop
; (rsbs r2, r2, #0) and then uses the unsigned register-shift form
; (vshl.u8 q0, r2) inside the loop, with no separate vector-splat
; instruction.
211 define dso_local arm_aapcs_vfpcc void @sink_lshr_i8(ptr nocapture readonly %in, ptr noalias nocapture %out, i8 %shift, i32 %N) {
212 ; CHECK-LABEL: sink_lshr_i8:
213 ; CHECK: @ %bb.0: @ %entry
214 ; CHECK-NEXT: .save {r7, lr}
215 ; CHECK-NEXT: push {r7, lr}
216 ; CHECK-NEXT: bic r3, r3, #3
217 ; CHECK-NEXT: rsbs r2, r2, #0
218 ; CHECK-NEXT: sub.w r12, r3, #4
219 ; CHECK-NEXT: movs r3, #1
220 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
221 ; CHECK-NEXT: .LBB5_1: @ %vector.body
222 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
223 ; CHECK-NEXT: vldrw.u32 q0, [r0], #4
224 ; CHECK-NEXT: vshl.u8 q0, r2
225 ; CHECK-NEXT: vstrb.8 q0, [r1], #4
226 ; CHECK-NEXT: le lr, .LBB5_1
227 ; CHECK-NEXT: @ %bb.2: @ %exit
228 ; CHECK-NEXT: pop {r7, pc}
; %n.vec is %N with its low two bits cleared (rounded down to a multiple of 4).
233 %n.vec = and i32 %N, -4
; Inserting %shift into lane 0 and shuffling with an all-zero mask broadcasts
; it to all 16 lanes.
234 %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0
235 %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer
236 br label %vector.body
; Bottom-tested loop: the body runs before the exit comparison and the loop
; leaves once %index.next equals %n.vec.
; NOTE(review): %index steps by 4 bytes while each load and store is
; <16 x i8> (16 bytes), so successive iterations touch overlapping memory;
; the #4 post-increment in the expected assembly reflects this. Confirm the
; overlap is intentional for this test.
238 vector.body: ; preds = %vector.body, %vector.ph
239 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
240 %gep.in = getelementptr inbounds i8, ptr %in, i32 %index
241 %wide.load = load <16 x i8>, ptr %gep.in, align 4
; The logical right shift whose splat operand is defined outside the loop.
242 %res = lshr <16 x i8> %wide.load, %broadcast.splat11
243 %gep.out = getelementptr inbounds i8, ptr %out, i32 %index
244 store <16 x i8> %res, ptr %gep.out, align 4
245 %index.next = add i32 %index, 4
246 %cmp = icmp eq i32 %index.next, %n.vec
247 br i1 %cmp, label %exit, label %vector.body
; Test: a loop that arithmetically shifts each <4 x i32> chunk loaded from
; %in right by a splat of the scalar %shift and stores the result to %out.
; The expected assembly below negates the shift amount once before the loop
; (rsbs r2, r2, #0) and then uses the signed register-shift form
; (vshl.s32 q0, r2) inside the loop, with no separate vector-splat
; instruction.
253 define dso_local arm_aapcs_vfpcc void @sink_ashr_i32(ptr nocapture readonly %in, ptr noalias nocapture %out, i32 %shift, i32 %N) {
254 ; CHECK-LABEL: sink_ashr_i32:
255 ; CHECK: @ %bb.0: @ %entry
256 ; CHECK-NEXT: .save {r7, lr}
257 ; CHECK-NEXT: push {r7, lr}
258 ; CHECK-NEXT: bic r3, r3, #3
259 ; CHECK-NEXT: rsbs r2, r2, #0
260 ; CHECK-NEXT: sub.w r12, r3, #4
261 ; CHECK-NEXT: movs r3, #1
262 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
263 ; CHECK-NEXT: .LBB6_1: @ %vector.body
264 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
265 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
266 ; CHECK-NEXT: vshl.s32 q0, r2
267 ; CHECK-NEXT: vstrb.8 q0, [r1], #16
268 ; CHECK-NEXT: le lr, .LBB6_1
269 ; CHECK-NEXT: @ %bb.2: @ %exit
270 ; CHECK-NEXT: pop {r7, pc}
; %n.vec is %N with its low two bits cleared (rounded down to a multiple of 4).
275 %n.vec = and i32 %N, -4
; Inserting %shift into lane 0 and shuffling with an all-zero mask broadcasts
; it to every lane of the vector.
276 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0
277 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
278 br label %vector.body
; Bottom-tested loop: the body runs before the exit comparison. %index
; advances by 4 i32 elements per iteration and the loop leaves once
; %index.next equals %n.vec.
280 vector.body: ; preds = %vector.body, %vector.ph
281 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
282 %gep.in = getelementptr inbounds i32, ptr %in, i32 %index
283 %wide.load = load <4 x i32>, ptr %gep.in, align 4
; The arithmetic right shift whose splat operand is defined outside the loop.
284 %res = ashr <4 x i32> %wide.load, %broadcast.splat11
285 %gep.out = getelementptr inbounds i32, ptr %out, i32 %index
286 store <4 x i32> %res, ptr %gep.out, align 4
287 %index.next = add i32 %index, 4
288 %cmp = icmp eq i32 %index.next, %n.vec
289 br i1 %cmp, label %exit, label %vector.body
; Test: a loop that arithmetically shifts each <8 x i16> chunk loaded from
; %in right by a splat of the i16 scalar %shift and stores the result to
; %out. The expected assembly below negates the shift amount once before the
; loop (rsbs r2, r2, #0) and then uses the signed register-shift form
; (vshl.s16 q0, r2) inside the loop, with no separate vector-splat
; instruction.
295 define dso_local arm_aapcs_vfpcc void @sink_ashr_i16(ptr nocapture readonly %in, ptr noalias nocapture %out, i16 %shift, i32 %N) {
296 ; CHECK-LABEL: sink_ashr_i16:
297 ; CHECK: @ %bb.0: @ %entry
298 ; CHECK-NEXT: .save {r7, lr}
299 ; CHECK-NEXT: push {r7, lr}
300 ; CHECK-NEXT: bic r3, r3, #3
301 ; CHECK-NEXT: rsbs r2, r2, #0
302 ; CHECK-NEXT: sub.w r12, r3, #4
303 ; CHECK-NEXT: movs r3, #1
304 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
305 ; CHECK-NEXT: .LBB7_1: @ %vector.body
306 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
307 ; CHECK-NEXT: vldrw.u32 q0, [r0], #8
308 ; CHECK-NEXT: vshl.s16 q0, r2
309 ; CHECK-NEXT: vstrb.8 q0, [r1], #8
310 ; CHECK-NEXT: le lr, .LBB7_1
311 ; CHECK-NEXT: @ %bb.2: @ %exit
312 ; CHECK-NEXT: pop {r7, pc}
; %n.vec is %N with its low two bits cleared (rounded down to a multiple of 4).
317 %n.vec = and i32 %N, -4
; Inserting %shift into lane 0 and shuffling with an all-zero mask broadcasts
; it to all 8 lanes.
318 %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0
319 %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer
320 br label %vector.body
; Bottom-tested loop: the body runs before the exit comparison and the loop
; leaves once %index.next equals %n.vec.
; NOTE(review): %index steps by 4 i16 elements (8 bytes) while each load and
; store is <8 x i16> (16 bytes), so successive iterations touch overlapping
; memory; the #8 post-increment in the expected assembly reflects this.
; Confirm the overlap is intentional for this test.
322 vector.body: ; preds = %vector.body, %vector.ph
323 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
324 %gep.in = getelementptr inbounds i16, ptr %in, i32 %index
325 %wide.load = load <8 x i16>, ptr %gep.in, align 4
; The arithmetic right shift whose splat operand is defined outside the loop.
326 %res = ashr <8 x i16> %wide.load, %broadcast.splat11
327 %gep.out = getelementptr inbounds i16, ptr %out, i32 %index
328 store <8 x i16> %res, ptr %gep.out, align 4
329 %index.next = add i32 %index, 4
330 %cmp = icmp eq i32 %index.next, %n.vec
331 br i1 %cmp, label %exit, label %vector.body
; Test: a loop that arithmetically shifts each <16 x i8> chunk loaded from
; %in right by a splat of the i8 scalar %shift and stores the result to
; %out. The expected assembly below negates the shift amount once before the
; loop (rsbs r2, r2, #0) and then uses the signed register-shift form
; (vshl.s8 q0, r2) inside the loop, with no separate vector-splat
; instruction.
337 define dso_local arm_aapcs_vfpcc void @sink_ashr_i8(ptr nocapture readonly %in, ptr noalias nocapture %out, i8 %shift, i32 %N) {
338 ; CHECK-LABEL: sink_ashr_i8:
339 ; CHECK: @ %bb.0: @ %entry
340 ; CHECK-NEXT: .save {r7, lr}
341 ; CHECK-NEXT: push {r7, lr}
342 ; CHECK-NEXT: bic r3, r3, #3
343 ; CHECK-NEXT: rsbs r2, r2, #0
344 ; CHECK-NEXT: sub.w r12, r3, #4
345 ; CHECK-NEXT: movs r3, #1
346 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
347 ; CHECK-NEXT: .LBB8_1: @ %vector.body
348 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
349 ; CHECK-NEXT: vldrw.u32 q0, [r0], #4
350 ; CHECK-NEXT: vshl.s8 q0, r2
351 ; CHECK-NEXT: vstrb.8 q0, [r1], #4
352 ; CHECK-NEXT: le lr, .LBB8_1
353 ; CHECK-NEXT: @ %bb.2: @ %exit
354 ; CHECK-NEXT: pop {r7, pc}
; %n.vec is %N with its low two bits cleared (rounded down to a multiple of 4).
359 %n.vec = and i32 %N, -4
; Inserting %shift into lane 0 and shuffling with an all-zero mask broadcasts
; it to all 16 lanes.
360 %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0
361 %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer
362 br label %vector.body
; Bottom-tested loop: the body runs before the exit comparison and the loop
; leaves once %index.next equals %n.vec.
; NOTE(review): %index steps by 4 bytes while each load and store is
; <16 x i8> (16 bytes), so successive iterations touch overlapping memory;
; the #4 post-increment in the expected assembly reflects this. Confirm the
; overlap is intentional for this test.
364 vector.body: ; preds = %vector.body, %vector.ph
365 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
366 %gep.in = getelementptr inbounds i8, ptr %in, i32 %index
367 %wide.load = load <16 x i8>, ptr %gep.in, align 4
; The arithmetic right shift whose splat operand is defined outside the loop.
368 %res = ashr <16 x i8> %wide.load, %broadcast.splat11
369 %gep.out = getelementptr inbounds i8, ptr %out, i32 %index
370 store <16 x i8> %res, ptr %gep.out, align 4
371 %index.next = add i32 %index, 4
372 %cmp = icmp eq i32 %index.next, %n.vec
373 br i1 %cmp, label %exit, label %vector.body