1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
4 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr) {
5 ; CHECK-LABEL: unscaled_v16i8_i8:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: vldrb.u8 q1, [r1]
8 ; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
11 %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
12 %offs.zext = zext <16 x i8> %offs to <16 x i32>
13 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
14 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
18 define arm_aapcs_vfpcc <8 x i8> @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr) {
19 ; CHECK-LABEL: unscaled_v8i8_i8:
20 ; CHECK: @ %bb.0: @ %entry
21 ; CHECK-NEXT: vldrb.u16 q1, [r1]
22 ; CHECK-NEXT: vldrb.u16 q0, [r0, q1]
25 %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
26 %offs.zext = zext <8 x i8> %offs to <8 x i32>
27 %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
28 %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
32 define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr) {
33 ; CHECK-LABEL: unscaled_v2i8_i8:
34 ; CHECK: @ %bb.0: @ %entry
35 ; CHECK-NEXT: ldrb r2, [r1]
36 ; CHECK-NEXT: vmov.i32 q0, #0xff
37 ; CHECK-NEXT: ldrb r1, [r1, #1]
38 ; CHECK-NEXT: vmov q1[2], q1[0], r2, r1
39 ; CHECK-NEXT: vand q0, q1, q0
40 ; CHECK-NEXT: vmov r1, s2
41 ; CHECK-NEXT: vmov r2, s0
42 ; CHECK-NEXT: ldrb r1, [r0, r1]
43 ; CHECK-NEXT: ldrb r0, [r0, r2]
44 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
47 %offs = load <2 x i8>, <2 x i8>* %offptr, align 1
48 %offs.zext = zext <2 x i8> %offs to <2 x i32>
49 %ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext
50 %gather = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> undef)
54 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr) {
55 ; CHECK-LABEL: unscaled_v16i8_sext:
56 ; CHECK: @ %bb.0: @ %entry
57 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
58 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
59 ; CHECK-NEXT: vldrb.s32 q0, [r1, #12]
60 ; CHECK-NEXT: vadd.i32 q0, q0, r0
61 ; CHECK-NEXT: vmov r2, r3, d1
62 ; CHECK-NEXT: vmov r4, r5, d0
63 ; CHECK-NEXT: vldrb.s32 q0, [r1]
64 ; CHECK-NEXT: vadd.i32 q2, q0, r0
65 ; CHECK-NEXT: vldrb.s32 q0, [r1, #8]
66 ; CHECK-NEXT: vadd.i32 q1, q0, r0
67 ; CHECK-NEXT: ldrb.w r12, [r2]
68 ; CHECK-NEXT: ldrb.w lr, [r3]
69 ; CHECK-NEXT: ldrb r3, [r4]
70 ; CHECK-NEXT: ldrb r2, [r5]
71 ; CHECK-NEXT: vmov r4, r5, d4
72 ; CHECK-NEXT: ldrb r4, [r4]
73 ; CHECK-NEXT: ldrb r5, [r5]
74 ; CHECK-NEXT: vmov.8 q0[0], r4
75 ; CHECK-NEXT: vmov r4, r6, d3
76 ; CHECK-NEXT: vmov.8 q0[1], r5
77 ; CHECK-NEXT: ldrb r5, [r4]
78 ; CHECK-NEXT: ldrb r4, [r6]
79 ; CHECK-NEXT: vmov r6, r7, d5
80 ; CHECK-NEXT: vldrb.s32 q2, [r1, #4]
81 ; CHECK-NEXT: vadd.i32 q2, q2, r0
82 ; CHECK-NEXT: ldrb r0, [r6]
83 ; CHECK-NEXT: ldrb r7, [r7]
84 ; CHECK-NEXT: vmov.8 q0[2], r0
85 ; CHECK-NEXT: vmov r0, r1, d4
86 ; CHECK-NEXT: vmov.8 q0[3], r7
87 ; CHECK-NEXT: ldrb r0, [r0]
88 ; CHECK-NEXT: ldrb r1, [r1]
89 ; CHECK-NEXT: vmov.8 q0[4], r0
90 ; CHECK-NEXT: vmov.8 q0[5], r1
91 ; CHECK-NEXT: vmov r0, r1, d5
92 ; CHECK-NEXT: ldrb r0, [r0]
93 ; CHECK-NEXT: ldrb r1, [r1]
94 ; CHECK-NEXT: vmov.8 q0[6], r0
95 ; CHECK-NEXT: vmov.8 q0[7], r1
96 ; CHECK-NEXT: vmov r0, r1, d2
97 ; CHECK-NEXT: ldrb r0, [r0]
98 ; CHECK-NEXT: ldrb r1, [r1]
99 ; CHECK-NEXT: vmov.8 q0[8], r0
100 ; CHECK-NEXT: vmov.8 q0[9], r1
101 ; CHECK-NEXT: vmov.8 q0[10], r5
102 ; CHECK-NEXT: vmov.8 q0[11], r4
103 ; CHECK-NEXT: vmov.8 q0[12], r3
104 ; CHECK-NEXT: vmov.8 q0[13], r2
105 ; CHECK-NEXT: vmov.8 q0[14], r12
106 ; CHECK-NEXT: vmov.8 q0[15], lr
107 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
109 %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
110 %offs.sext = sext <16 x i8> %offs to <16 x i32>
111 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
112 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
113 ret <16 x i8> %gather
116 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr) {
117 ; CHECK-LABEL: unscaled_v16i8_i16:
118 ; CHECK: @ %bb.0: @ %entry
119 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
120 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
121 ; CHECK-NEXT: vldrh.s32 q0, [r1, #24]
122 ; CHECK-NEXT: vadd.i32 q0, q0, r0
123 ; CHECK-NEXT: vmov r2, r3, d1
124 ; CHECK-NEXT: vmov r4, r5, d0
125 ; CHECK-NEXT: vldrh.s32 q0, [r1]
126 ; CHECK-NEXT: vadd.i32 q2, q0, r0
127 ; CHECK-NEXT: vldrh.s32 q0, [r1, #16]
128 ; CHECK-NEXT: vadd.i32 q1, q0, r0
129 ; CHECK-NEXT: ldrb.w r12, [r2]
130 ; CHECK-NEXT: ldrb.w lr, [r3]
131 ; CHECK-NEXT: ldrb r3, [r4]
132 ; CHECK-NEXT: ldrb r2, [r5]
133 ; CHECK-NEXT: vmov r4, r5, d4
134 ; CHECK-NEXT: ldrb r4, [r4]
135 ; CHECK-NEXT: ldrb r5, [r5]
136 ; CHECK-NEXT: vmov.8 q0[0], r4
137 ; CHECK-NEXT: vmov r4, r6, d3
138 ; CHECK-NEXT: vmov.8 q0[1], r5
139 ; CHECK-NEXT: ldrb r5, [r4]
140 ; CHECK-NEXT: ldrb r4, [r6]
141 ; CHECK-NEXT: vmov r6, r7, d5
142 ; CHECK-NEXT: vldrh.s32 q2, [r1, #8]
143 ; CHECK-NEXT: vadd.i32 q2, q2, r0
144 ; CHECK-NEXT: ldrb r0, [r6]
145 ; CHECK-NEXT: ldrb r7, [r7]
146 ; CHECK-NEXT: vmov.8 q0[2], r0
147 ; CHECK-NEXT: vmov r0, r1, d4
148 ; CHECK-NEXT: vmov.8 q0[3], r7
149 ; CHECK-NEXT: ldrb r0, [r0]
150 ; CHECK-NEXT: ldrb r1, [r1]
151 ; CHECK-NEXT: vmov.8 q0[4], r0
152 ; CHECK-NEXT: vmov.8 q0[5], r1
153 ; CHECK-NEXT: vmov r0, r1, d5
154 ; CHECK-NEXT: ldrb r0, [r0]
155 ; CHECK-NEXT: ldrb r1, [r1]
156 ; CHECK-NEXT: vmov.8 q0[6], r0
157 ; CHECK-NEXT: vmov.8 q0[7], r1
158 ; CHECK-NEXT: vmov r0, r1, d2
159 ; CHECK-NEXT: ldrb r0, [r0]
160 ; CHECK-NEXT: ldrb r1, [r1]
161 ; CHECK-NEXT: vmov.8 q0[8], r0
162 ; CHECK-NEXT: vmov.8 q0[9], r1
163 ; CHECK-NEXT: vmov.8 q0[10], r5
164 ; CHECK-NEXT: vmov.8 q0[11], r4
165 ; CHECK-NEXT: vmov.8 q0[12], r3
166 ; CHECK-NEXT: vmov.8 q0[13], r2
167 ; CHECK-NEXT: vmov.8 q0[14], r12
168 ; CHECK-NEXT: vmov.8 q0[15], lr
169 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
171 %offs = load <16 x i16>, <16 x i16>* %offptr, align 2
172 %offs.sext = sext <16 x i16> %offs to <16 x i32>
173 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
174 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
175 ret <16 x i8> %gather
178 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr) {
179 ; CHECK-LABEL: unscaled_v16i8_scaled:
180 ; CHECK: @ %bb.0: @ %entry
181 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
182 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
183 ; CHECK-NEXT: vldrb.u32 q0, [r1, #12]
184 ; CHECK-NEXT: vshl.i32 q0, q0, #2
185 ; CHECK-NEXT: vadd.i32 q0, q0, r0
186 ; CHECK-NEXT: vmov r2, r3, d1
187 ; CHECK-NEXT: vmov r4, r5, d0
188 ; CHECK-NEXT: vldrb.u32 q0, [r1]
189 ; CHECK-NEXT: vshl.i32 q0, q0, #2
190 ; CHECK-NEXT: vadd.i32 q2, q0, r0
191 ; CHECK-NEXT: vldrb.u32 q0, [r1, #8]
192 ; CHECK-NEXT: vshl.i32 q0, q0, #2
193 ; CHECK-NEXT: vadd.i32 q1, q0, r0
194 ; CHECK-NEXT: ldrb.w r12, [r2]
195 ; CHECK-NEXT: ldrb.w lr, [r3]
196 ; CHECK-NEXT: ldrb r3, [r4]
197 ; CHECK-NEXT: ldrb r2, [r5]
198 ; CHECK-NEXT: vmov r4, r5, d4
199 ; CHECK-NEXT: ldrb r4, [r4]
200 ; CHECK-NEXT: ldrb r5, [r5]
201 ; CHECK-NEXT: vmov.8 q0[0], r4
202 ; CHECK-NEXT: vmov r4, r6, d3
203 ; CHECK-NEXT: vmov.8 q0[1], r5
204 ; CHECK-NEXT: ldrb r5, [r4]
205 ; CHECK-NEXT: ldrb r4, [r6]
206 ; CHECK-NEXT: vmov r6, r7, d5
207 ; CHECK-NEXT: vldrb.u32 q2, [r1, #4]
208 ; CHECK-NEXT: vshl.i32 q2, q2, #2
209 ; CHECK-NEXT: vadd.i32 q2, q2, r0
210 ; CHECK-NEXT: ldrb r0, [r6]
211 ; CHECK-NEXT: ldrb r7, [r7]
212 ; CHECK-NEXT: vmov.8 q0[2], r0
213 ; CHECK-NEXT: vmov r0, r1, d4
214 ; CHECK-NEXT: vmov.8 q0[3], r7
215 ; CHECK-NEXT: ldrb r0, [r0]
216 ; CHECK-NEXT: ldrb r1, [r1]
217 ; CHECK-NEXT: vmov.8 q0[4], r0
218 ; CHECK-NEXT: vmov.8 q0[5], r1
219 ; CHECK-NEXT: vmov r0, r1, d5
220 ; CHECK-NEXT: ldrb r0, [r0]
221 ; CHECK-NEXT: ldrb r1, [r1]
222 ; CHECK-NEXT: vmov.8 q0[6], r0
223 ; CHECK-NEXT: vmov.8 q0[7], r1
224 ; CHECK-NEXT: vmov r0, r1, d2
225 ; CHECK-NEXT: ldrb r0, [r0]
226 ; CHECK-NEXT: ldrb r1, [r1]
227 ; CHECK-NEXT: vmov.8 q0[8], r0
228 ; CHECK-NEXT: vmov.8 q0[9], r1
229 ; CHECK-NEXT: vmov.8 q0[10], r5
230 ; CHECK-NEXT: vmov.8 q0[11], r4
231 ; CHECK-NEXT: vmov.8 q0[12], r3
232 ; CHECK-NEXT: vmov.8 q0[13], r2
233 ; CHECK-NEXT: vmov.8 q0[14], r12
234 ; CHECK-NEXT: vmov.8 q0[15], lr
235 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
237 %offs = load <16 x i8>, <16 x i8>* %offptr, align 4
238 %offs.zext = zext <16 x i8> %offs to <16 x i32>
239 %ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext
240 %ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*>
241 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
242 ret <16 x i8> %gather
245 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr) {
246 ; CHECK-LABEL: unscaled_v16i8_i8_next:
247 ; CHECK: @ %bb.0: @ %entry
248 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
249 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
250 ; CHECK-NEXT: vldrw.u32 q0, [r1, #48]
251 ; CHECK-NEXT: vadd.i32 q0, q0, r0
252 ; CHECK-NEXT: vmov r2, r3, d1
253 ; CHECK-NEXT: vmov r4, r5, d0
254 ; CHECK-NEXT: vldrw.u32 q0, [r1]
255 ; CHECK-NEXT: vadd.i32 q2, q0, r0
256 ; CHECK-NEXT: vldrw.u32 q0, [r1, #32]
257 ; CHECK-NEXT: vadd.i32 q1, q0, r0
258 ; CHECK-NEXT: ldrb.w r12, [r2]
259 ; CHECK-NEXT: ldrb.w lr, [r3]
260 ; CHECK-NEXT: ldrb r3, [r4]
261 ; CHECK-NEXT: ldrb r2, [r5]
262 ; CHECK-NEXT: vmov r4, r5, d4
263 ; CHECK-NEXT: ldrb r4, [r4]
264 ; CHECK-NEXT: ldrb r5, [r5]
265 ; CHECK-NEXT: vmov.8 q0[0], r4
266 ; CHECK-NEXT: vmov r4, r6, d3
267 ; CHECK-NEXT: vmov.8 q0[1], r5
268 ; CHECK-NEXT: ldrb r5, [r4]
269 ; CHECK-NEXT: ldrb r4, [r6]
270 ; CHECK-NEXT: vmov r6, r7, d5
271 ; CHECK-NEXT: vldrw.u32 q2, [r1, #16]
272 ; CHECK-NEXT: vadd.i32 q2, q2, r0
273 ; CHECK-NEXT: ldrb r0, [r6]
274 ; CHECK-NEXT: ldrb r7, [r7]
275 ; CHECK-NEXT: vmov.8 q0[2], r0
276 ; CHECK-NEXT: vmov r0, r1, d4
277 ; CHECK-NEXT: vmov.8 q0[3], r7
278 ; CHECK-NEXT: ldrb r0, [r0]
279 ; CHECK-NEXT: ldrb r1, [r1]
280 ; CHECK-NEXT: vmov.8 q0[4], r0
281 ; CHECK-NEXT: vmov.8 q0[5], r1
282 ; CHECK-NEXT: vmov r0, r1, d5
283 ; CHECK-NEXT: ldrb r0, [r0]
284 ; CHECK-NEXT: ldrb r1, [r1]
285 ; CHECK-NEXT: vmov.8 q0[6], r0
286 ; CHECK-NEXT: vmov.8 q0[7], r1
287 ; CHECK-NEXT: vmov r0, r1, d2
288 ; CHECK-NEXT: ldrb r0, [r0]
289 ; CHECK-NEXT: ldrb r1, [r1]
290 ; CHECK-NEXT: vmov.8 q0[8], r0
291 ; CHECK-NEXT: vmov.8 q0[9], r1
292 ; CHECK-NEXT: vmov.8 q0[10], r5
293 ; CHECK-NEXT: vmov.8 q0[11], r4
294 ; CHECK-NEXT: vmov.8 q0[12], r3
295 ; CHECK-NEXT: vmov.8 q0[13], r2
296 ; CHECK-NEXT: vmov.8 q0[14], r12
297 ; CHECK-NEXT: vmov.8 q0[15], lr
298 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
300 %offs = load <16 x i32>, <16 x i32>* %offptr, align 4
301 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
302 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
303 ret <16 x i8> %gather
306 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr) {
307 ; CHECK-LABEL: unscaled_v16i8_i8_2gep:
308 ; CHECK: @ %bb.0: @ %entry
309 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
310 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
311 ; CHECK-NEXT: vldrb.s32 q0, [r1, #12]
312 ; CHECK-NEXT: vmov.i32 q2, #0x5
313 ; CHECK-NEXT: vadd.i32 q0, q0, r0
314 ; CHECK-NEXT: vadd.i32 q0, q0, q2
315 ; CHECK-NEXT: vmov r2, r3, d1
316 ; CHECK-NEXT: vmov r4, r5, d0
317 ; CHECK-NEXT: vldrb.s32 q0, [r1]
318 ; CHECK-NEXT: vadd.i32 q0, q0, r0
319 ; CHECK-NEXT: vadd.i32 q3, q0, q2
320 ; CHECK-NEXT: vldrb.s32 q0, [r1, #8]
321 ; CHECK-NEXT: vadd.i32 q0, q0, r0
322 ; CHECK-NEXT: vadd.i32 q1, q0, q2
323 ; CHECK-NEXT: ldrb.w r12, [r2]
324 ; CHECK-NEXT: ldrb.w lr, [r3]
325 ; CHECK-NEXT: ldrb r3, [r4]
326 ; CHECK-NEXT: ldrb r2, [r5]
327 ; CHECK-NEXT: vmov r4, r5, d6
328 ; CHECK-NEXT: ldrb r4, [r4]
329 ; CHECK-NEXT: ldrb r5, [r5]
330 ; CHECK-NEXT: vmov.8 q0[0], r4
331 ; CHECK-NEXT: vmov r4, r6, d3
332 ; CHECK-NEXT: vmov.8 q0[1], r5
333 ; CHECK-NEXT: ldrb r5, [r4]
334 ; CHECK-NEXT: ldrb r4, [r6]
335 ; CHECK-NEXT: vmov r6, r7, d7
336 ; CHECK-NEXT: vldrb.s32 q3, [r1, #4]
337 ; CHECK-NEXT: vadd.i32 q3, q3, r0
338 ; CHECK-NEXT: vadd.i32 q2, q3, q2
339 ; CHECK-NEXT: ldrb r0, [r6]
340 ; CHECK-NEXT: ldrb r7, [r7]
341 ; CHECK-NEXT: vmov.8 q0[2], r0
342 ; CHECK-NEXT: vmov r0, r1, d4
343 ; CHECK-NEXT: vmov.8 q0[3], r7
344 ; CHECK-NEXT: ldrb r0, [r0]
345 ; CHECK-NEXT: ldrb r1, [r1]
346 ; CHECK-NEXT: vmov.8 q0[4], r0
347 ; CHECK-NEXT: vmov.8 q0[5], r1
348 ; CHECK-NEXT: vmov r0, r1, d5
349 ; CHECK-NEXT: ldrb r0, [r0]
350 ; CHECK-NEXT: ldrb r1, [r1]
351 ; CHECK-NEXT: vmov.8 q0[6], r0
352 ; CHECK-NEXT: vmov.8 q0[7], r1
353 ; CHECK-NEXT: vmov r0, r1, d2
354 ; CHECK-NEXT: ldrb r0, [r0]
355 ; CHECK-NEXT: ldrb r1, [r1]
356 ; CHECK-NEXT: vmov.8 q0[8], r0
357 ; CHECK-NEXT: vmov.8 q0[9], r1
358 ; CHECK-NEXT: vmov.8 q0[10], r5
359 ; CHECK-NEXT: vmov.8 q0[11], r4
360 ; CHECK-NEXT: vmov.8 q0[12], r3
361 ; CHECK-NEXT: vmov.8 q0[13], r2
362 ; CHECK-NEXT: vmov.8 q0[14], r12
363 ; CHECK-NEXT: vmov.8 q0[15], lr
364 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
366 %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
367 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
368 %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
369 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
370 ret <16 x i8> %gather
374 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr) {
375 ; CHECK-LABEL: unscaled_v16i8_i8_2gep2:
376 ; CHECK: @ %bb.0: @ %entry
377 ; CHECK-NEXT: adr r1, .LCPI8_0
378 ; CHECK-NEXT: vldrw.u32 q1, [r1]
379 ; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
381 ; CHECK-NEXT: .p2align 4
382 ; CHECK-NEXT: @ %bb.1:
383 ; CHECK-NEXT: .LCPI8_0:
384 ; CHECK-NEXT: .byte 5 @ 0x5
385 ; CHECK-NEXT: .byte 8 @ 0x8
386 ; CHECK-NEXT: .byte 11 @ 0xb
387 ; CHECK-NEXT: .byte 14 @ 0xe
388 ; CHECK-NEXT: .byte 17 @ 0x11
389 ; CHECK-NEXT: .byte 20 @ 0x14
390 ; CHECK-NEXT: .byte 23 @ 0x17
391 ; CHECK-NEXT: .byte 26 @ 0x1a
392 ; CHECK-NEXT: .byte 29 @ 0x1d
393 ; CHECK-NEXT: .byte 32 @ 0x20
394 ; CHECK-NEXT: .byte 35 @ 0x23
395 ; CHECK-NEXT: .byte 38 @ 0x26
396 ; CHECK-NEXT: .byte 41 @ 0x29
397 ; CHECK-NEXT: .byte 44 @ 0x2c
398 ; CHECK-NEXT: .byte 47 @ 0x2f
399 ; CHECK-NEXT: .byte 50 @ 0x32
401 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> <i8 0, i8 3, i8 6, i8 9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45>
402 %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
403 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
404 ret <16 x i8> %gather
408 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep(i8* %base) {
409 ; CHECK-LABEL: unscaled_v16i8_i8_biggep:
410 ; CHECK: @ %bb.0: @ %entry
411 ; CHECK-NEXT: adr r1, .LCPI9_0
412 ; CHECK-NEXT: vldrw.u32 q1, [r1]
413 ; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
415 ; CHECK-NEXT: .p2align 4
416 ; CHECK-NEXT: @ %bb.1:
417 ; CHECK-NEXT: .LCPI9_0:
418 ; CHECK-NEXT: .byte 5 @ 0x5
419 ; CHECK-NEXT: .byte 8 @ 0x8
420 ; CHECK-NEXT: .byte 11 @ 0xb
421 ; CHECK-NEXT: .byte 14 @ 0xe
422 ; CHECK-NEXT: .byte 17 @ 0x11
423 ; CHECK-NEXT: .byte 20 @ 0x14
424 ; CHECK-NEXT: .byte 23 @ 0x17
425 ; CHECK-NEXT: .byte 26 @ 0x1a
426 ; CHECK-NEXT: .byte 29 @ 0x1d
427 ; CHECK-NEXT: .byte 32 @ 0x20
428 ; CHECK-NEXT: .byte 35 @ 0x23
429 ; CHECK-NEXT: .byte 38 @ 0x26
430 ; CHECK-NEXT: .byte 41 @ 0x29
431 ; CHECK-NEXT: .byte 44 @ 0x2c
432 ; CHECK-NEXT: .byte 47 @ 0x2f
433 ; CHECK-NEXT: .byte 50 @ 0x32
435 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
436 %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 5
437 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
438 ret <16 x i8> %gather
442 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep2(i8* %base) {
443 ; CHECK-LABEL: unscaled_v16i8_i8_biggep2:
444 ; CHECK: @ %bb.0: @ %entry
445 ; CHECK-NEXT: adr r1, .LCPI10_0
446 ; CHECK-NEXT: vldrw.u32 q1, [r1]
447 ; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
449 ; CHECK-NEXT: .p2align 4
450 ; CHECK-NEXT: @ %bb.1:
451 ; CHECK-NEXT: .LCPI10_0:
452 ; CHECK-NEXT: .byte 0 @ 0x0
453 ; CHECK-NEXT: .byte 3 @ 0x3
454 ; CHECK-NEXT: .byte 6 @ 0x6
455 ; CHECK-NEXT: .byte 9 @ 0x9
456 ; CHECK-NEXT: .byte 12 @ 0xc
457 ; CHECK-NEXT: .byte 15 @ 0xf
458 ; CHECK-NEXT: .byte 18 @ 0x12
459 ; CHECK-NEXT: .byte 21 @ 0x15
460 ; CHECK-NEXT: .byte 24 @ 0x18
461 ; CHECK-NEXT: .byte 27 @ 0x1b
462 ; CHECK-NEXT: .byte 30 @ 0x1e
463 ; CHECK-NEXT: .byte 33 @ 0x21
464 ; CHECK-NEXT: .byte 36 @ 0x24
465 ; CHECK-NEXT: .byte 39 @ 0x27
466 ; CHECK-NEXT: .byte 42 @ 0x2a
467 ; CHECK-NEXT: .byte 45 @ 0x2d
469 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
470 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
471 ret <16 x i8> %gather
475 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep3(i8* %base) {
476 ; CHECK-LABEL: unscaled_v16i8_i8_biggep3:
477 ; CHECK: @ %bb.0: @ %entry
478 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
479 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
480 ; CHECK-NEXT: adr r1, .LCPI11_0
481 ; CHECK-NEXT: adr r4, .LCPI11_1
482 ; CHECK-NEXT: vldrw.u32 q0, [r1]
483 ; CHECK-NEXT: adr r7, .LCPI11_3
484 ; CHECK-NEXT: vadd.i32 q0, q0, r0
485 ; CHECK-NEXT: vmov r1, r2, d1
486 ; CHECK-NEXT: vmov r3, r5, d0
487 ; CHECK-NEXT: vldrw.u32 q0, [r4]
488 ; CHECK-NEXT: vadd.i32 q1, q0, r0
489 ; CHECK-NEXT: vmov r4, r6, d3
490 ; CHECK-NEXT: ldrb.w lr, [r1]
491 ; CHECK-NEXT: adr r1, .LCPI11_2
492 ; CHECK-NEXT: vldrw.u32 q0, [r1]
493 ; CHECK-NEXT: ldrb.w r12, [r2]
494 ; CHECK-NEXT: ldrb r1, [r5]
495 ; CHECK-NEXT: vadd.i32 q2, q0, r0
496 ; CHECK-NEXT: ldrb r3, [r3]
497 ; CHECK-NEXT: ldrb r2, [r6]
498 ; CHECK-NEXT: vmov r5, r6, d4
499 ; CHECK-NEXT: ldrb r4, [r4]
500 ; CHECK-NEXT: ldrb r5, [r5]
501 ; CHECK-NEXT: vmov.8 q0[0], r5
502 ; CHECK-NEXT: ldrb r5, [r6]
503 ; CHECK-NEXT: vmov.8 q0[1], r5
504 ; CHECK-NEXT: vmov r5, r6, d5
505 ; CHECK-NEXT: vldrw.u32 q2, [r7]
506 ; CHECK-NEXT: vadd.i32 q2, q2, r0
507 ; CHECK-NEXT: ldrb r0, [r5]
508 ; CHECK-NEXT: ldrb r6, [r6]
509 ; CHECK-NEXT: vmov.8 q0[2], r0
510 ; CHECK-NEXT: vmov r0, r5, d4
511 ; CHECK-NEXT: vmov.8 q0[3], r6
512 ; CHECK-NEXT: ldrb r0, [r0]
513 ; CHECK-NEXT: ldrb r5, [r5]
514 ; CHECK-NEXT: vmov.8 q0[4], r0
515 ; CHECK-NEXT: vmov.8 q0[5], r5
516 ; CHECK-NEXT: vmov r0, r5, d5
517 ; CHECK-NEXT: ldrb r0, [r0]
518 ; CHECK-NEXT: ldrb r5, [r5]
519 ; CHECK-NEXT: vmov.8 q0[6], r0
520 ; CHECK-NEXT: vmov.8 q0[7], r5
521 ; CHECK-NEXT: vmov r0, r5, d2
522 ; CHECK-NEXT: ldrb r0, [r0]
523 ; CHECK-NEXT: ldrb r5, [r5]
524 ; CHECK-NEXT: vmov.8 q0[8], r0
525 ; CHECK-NEXT: vmov.8 q0[9], r5
526 ; CHECK-NEXT: vmov.8 q0[10], r4
527 ; CHECK-NEXT: vmov.8 q0[11], r2
528 ; CHECK-NEXT: vmov.8 q0[12], r3
529 ; CHECK-NEXT: vmov.8 q0[13], r1
530 ; CHECK-NEXT: vmov.8 q0[14], lr
531 ; CHECK-NEXT: vmov.8 q0[15], r12
532 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
533 ; CHECK-NEXT: .p2align 4
534 ; CHECK-NEXT: @ %bb.1:
535 ; CHECK-NEXT: .LCPI11_0:
536 ; CHECK-NEXT: .long 292 @ 0x124
537 ; CHECK-NEXT: .long 295 @ 0x127
538 ; CHECK-NEXT: .long 298 @ 0x12a
539 ; CHECK-NEXT: .long 301 @ 0x12d
540 ; CHECK-NEXT: .LCPI11_1:
541 ; CHECK-NEXT: .long 280 @ 0x118
542 ; CHECK-NEXT: .long 283 @ 0x11b
543 ; CHECK-NEXT: .long 286 @ 0x11e
544 ; CHECK-NEXT: .long 289 @ 0x121
545 ; CHECK-NEXT: .LCPI11_2:
546 ; CHECK-NEXT: .long 256 @ 0x100
547 ; CHECK-NEXT: .long 259 @ 0x103
548 ; CHECK-NEXT: .long 262 @ 0x106
549 ; CHECK-NEXT: .long 265 @ 0x109
550 ; CHECK-NEXT: .LCPI11_3:
551 ; CHECK-NEXT: .long 268 @ 0x10c
552 ; CHECK-NEXT: .long 271 @ 0x10f
553 ; CHECK-NEXT: .long 274 @ 0x112
554 ; CHECK-NEXT: .long 277 @ 0x115
556 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
557 %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 256
558 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
559 ret <16 x i8> %gather
563 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep4(i8* %base) {
564 ; CHECK-LABEL: unscaled_v16i8_i8_biggep4:
565 ; CHECK: @ %bb.0: @ %entry
566 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
567 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
568 ; CHECK-NEXT: adr r1, .LCPI12_0
569 ; CHECK-NEXT: adr r4, .LCPI12_1
570 ; CHECK-NEXT: vldrw.u32 q0, [r1]
571 ; CHECK-NEXT: adr r7, .LCPI12_3
572 ; CHECK-NEXT: vadd.i32 q0, q0, r0
573 ; CHECK-NEXT: vmov r1, r2, d1
574 ; CHECK-NEXT: vmov r3, r5, d0
575 ; CHECK-NEXT: vldrw.u32 q0, [r4]
576 ; CHECK-NEXT: vadd.i32 q1, q0, r0
577 ; CHECK-NEXT: vmov r4, r6, d3
578 ; CHECK-NEXT: ldrb.w lr, [r1]
579 ; CHECK-NEXT: adr r1, .LCPI12_2
580 ; CHECK-NEXT: vldrw.u32 q0, [r1]
581 ; CHECK-NEXT: ldrb.w r12, [r2]
582 ; CHECK-NEXT: ldrb r1, [r5]
583 ; CHECK-NEXT: vadd.i32 q2, q0, r0
584 ; CHECK-NEXT: ldrb r3, [r3]
585 ; CHECK-NEXT: ldrb r2, [r6]
586 ; CHECK-NEXT: vmov r5, r6, d4
587 ; CHECK-NEXT: ldrb r4, [r4]
588 ; CHECK-NEXT: ldrb r5, [r5]
589 ; CHECK-NEXT: vmov.8 q0[0], r5
590 ; CHECK-NEXT: ldrb r5, [r6]
591 ; CHECK-NEXT: vmov.8 q0[1], r5
592 ; CHECK-NEXT: vmov r5, r6, d5
593 ; CHECK-NEXT: vldrw.u32 q2, [r7]
594 ; CHECK-NEXT: vadd.i32 q2, q2, r0
595 ; CHECK-NEXT: ldrb r0, [r5]
596 ; CHECK-NEXT: ldrb r6, [r6]
597 ; CHECK-NEXT: vmov.8 q0[2], r0
598 ; CHECK-NEXT: vmov r0, r5, d4
599 ; CHECK-NEXT: vmov.8 q0[3], r6
600 ; CHECK-NEXT: ldrb r0, [r0]
601 ; CHECK-NEXT: ldrb r5, [r5]
602 ; CHECK-NEXT: vmov.8 q0[4], r0
603 ; CHECK-NEXT: vmov.8 q0[5], r5
604 ; CHECK-NEXT: vmov r0, r5, d5
605 ; CHECK-NEXT: ldrb r0, [r0]
606 ; CHECK-NEXT: ldrb r5, [r5]
607 ; CHECK-NEXT: vmov.8 q0[6], r0
608 ; CHECK-NEXT: vmov.8 q0[7], r5
609 ; CHECK-NEXT: vmov r0, r5, d2
610 ; CHECK-NEXT: ldrb r0, [r0]
611 ; CHECK-NEXT: ldrb r5, [r5]
612 ; CHECK-NEXT: vmov.8 q0[8], r0
613 ; CHECK-NEXT: vmov.8 q0[9], r5
614 ; CHECK-NEXT: vmov.8 q0[10], r4
615 ; CHECK-NEXT: vmov.8 q0[11], r2
616 ; CHECK-NEXT: vmov.8 q0[12], r3
617 ; CHECK-NEXT: vmov.8 q0[13], r1
618 ; CHECK-NEXT: vmov.8 q0[14], lr
619 ; CHECK-NEXT: vmov.8 q0[15], r12
620 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
621 ; CHECK-NEXT: .p2align 4
622 ; CHECK-NEXT: @ %bb.1:
623 ; CHECK-NEXT: .LCPI12_0:
624 ; CHECK-NEXT: .long 36 @ 0x24
625 ; CHECK-NEXT: .long 39 @ 0x27
626 ; CHECK-NEXT: .long 42 @ 0x2a
627 ; CHECK-NEXT: .long 45 @ 0x2d
628 ; CHECK-NEXT: .LCPI12_1:
629 ; CHECK-NEXT: .long 256 @ 0x100
630 ; CHECK-NEXT: .long 27 @ 0x1b
631 ; CHECK-NEXT: .long 30 @ 0x1e
632 ; CHECK-NEXT: .long 33 @ 0x21
633 ; CHECK-NEXT: .LCPI12_2:
634 ; CHECK-NEXT: .long 0 @ 0x0
635 ; CHECK-NEXT: .long 3 @ 0x3
636 ; CHECK-NEXT: .long 6 @ 0x6
637 ; CHECK-NEXT: .long 9 @ 0x9
638 ; CHECK-NEXT: .LCPI12_3:
639 ; CHECK-NEXT: .long 12 @ 0xc
640 ; CHECK-NEXT: .long 15 @ 0xf
641 ; CHECK-NEXT: .long 18 @ 0x12
642 ; CHECK-NEXT: .long 21 @ 0x15
644 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 256, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
645 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
646 ret <16 x i8> %gather
650 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep5(<16 x i8*> %base) {
651 ; CHECK-LABEL: unscaled_v16i8_i8_biggep5:
652 ; CHECK: @ %bb.0: @ %entry
653 ; CHECK-NEXT: .save {r4, r5, r6, lr}
654 ; CHECK-NEXT: push {r4, r5, r6, lr}
655 ; CHECK-NEXT: .vsave {d8, d9}
656 ; CHECK-NEXT: vpush {d8, d9}
657 ; CHECK-NEXT: vmov.i32 q4, #0x100
658 ; CHECK-NEXT: vadd.i32 q3, q3, q4
659 ; CHECK-NEXT: vadd.i32 q2, q2, q4
660 ; CHECK-NEXT: vmov r3, r2, d7
661 ; CHECK-NEXT: vadd.i32 q1, q1, q4
662 ; CHECK-NEXT: vmov r0, r1, d6
663 ; CHECK-NEXT: vadd.i32 q3, q0, q4
664 ; CHECK-NEXT: vmov r5, r6, d5
665 ; CHECK-NEXT: ldrb.w lr, [r3]
666 ; CHECK-NEXT: ldrb r3, [r1]
667 ; CHECK-NEXT: ldrb.w r12, [r2]
668 ; CHECK-NEXT: ldrb r1, [r5]
669 ; CHECK-NEXT: vmov r2, r5, d6
670 ; CHECK-NEXT: ldrb r4, [r0]
671 ; CHECK-NEXT: ldrb r0, [r6]
672 ; CHECK-NEXT: ldrb r2, [r2]
673 ; CHECK-NEXT: ldrb r5, [r5]
674 ; CHECK-NEXT: vmov.8 q0[0], r2
675 ; CHECK-NEXT: vmov.8 q0[1], r5
676 ; CHECK-NEXT: vmov r2, r5, d7
677 ; CHECK-NEXT: ldrb r2, [r2]
678 ; CHECK-NEXT: ldrb r5, [r5]
679 ; CHECK-NEXT: vmov.8 q0[2], r2
680 ; CHECK-NEXT: vmov.8 q0[3], r5
681 ; CHECK-NEXT: vmov r2, r5, d2
682 ; CHECK-NEXT: ldrb r2, [r2]
683 ; CHECK-NEXT: ldrb r5, [r5]
684 ; CHECK-NEXT: vmov.8 q0[4], r2
685 ; CHECK-NEXT: vmov.8 q0[5], r5
686 ; CHECK-NEXT: vmov r2, r5, d3
687 ; CHECK-NEXT: ldrb r2, [r2]
688 ; CHECK-NEXT: ldrb r5, [r5]
689 ; CHECK-NEXT: vmov.8 q0[6], r2
690 ; CHECK-NEXT: vmov.8 q0[7], r5
691 ; CHECK-NEXT: vmov r2, r5, d4
692 ; CHECK-NEXT: ldrb r2, [r2]
693 ; CHECK-NEXT: ldrb r5, [r5]
694 ; CHECK-NEXT: vmov.8 q0[8], r2
695 ; CHECK-NEXT: vmov.8 q0[9], r5
696 ; CHECK-NEXT: vmov.8 q0[10], r1
697 ; CHECK-NEXT: vmov.8 q0[11], r0
698 ; CHECK-NEXT: vmov.8 q0[12], r4
699 ; CHECK-NEXT: vmov.8 q0[13], r3
700 ; CHECK-NEXT: vmov.8 q0[14], lr
701 ; CHECK-NEXT: vmov.8 q0[15], r12
702 ; CHECK-NEXT: vpop {d8, d9}
703 ; CHECK-NEXT: pop {r4, r5, r6, pc}
705 %ptrs2 = getelementptr inbounds i8, <16 x i8*> %base, i32 256
706 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
707 ret <16 x i8> %gather
711 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep6(i8* %base) {
712 ; CHECK-LABEL: unscaled_v16i8_i8_biggep6:
713 ; CHECK: @ %bb.0: @ %entry
714 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
715 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
716 ; CHECK-NEXT: adr r1, .LCPI14_0
717 ; CHECK-NEXT: adr r4, .LCPI14_1
718 ; CHECK-NEXT: vldrw.u32 q0, [r1]
719 ; CHECK-NEXT: adr r7, .LCPI14_3
720 ; CHECK-NEXT: vadd.i32 q0, q0, r0
721 ; CHECK-NEXT: vmov r1, r2, d1
722 ; CHECK-NEXT: vmov r3, r5, d0
723 ; CHECK-NEXT: vldrw.u32 q0, [r4]
724 ; CHECK-NEXT: vadd.i32 q1, q0, r0
725 ; CHECK-NEXT: vmov r4, r6, d3
726 ; CHECK-NEXT: ldrb.w lr, [r1]
727 ; CHECK-NEXT: adr r1, .LCPI14_2
728 ; CHECK-NEXT: vldrw.u32 q0, [r1]
729 ; CHECK-NEXT: ldrb.w r12, [r2]
730 ; CHECK-NEXT: ldrb r1, [r5]
731 ; CHECK-NEXT: vadd.i32 q2, q0, r0
732 ; CHECK-NEXT: ldrb r3, [r3]
733 ; CHECK-NEXT: ldrb r2, [r6]
734 ; CHECK-NEXT: vmov r5, r6, d4
735 ; CHECK-NEXT: ldrb r4, [r4]
736 ; CHECK-NEXT: ldrb r5, [r5]
737 ; CHECK-NEXT: vmov.8 q0[0], r5
738 ; CHECK-NEXT: ldrb r5, [r6]
739 ; CHECK-NEXT: vmov.8 q0[1], r5
740 ; CHECK-NEXT: vmov r5, r6, d5
741 ; CHECK-NEXT: vldrw.u32 q2, [r7]
742 ; CHECK-NEXT: vadd.i32 q2, q2, r0
743 ; CHECK-NEXT: ldrb r0, [r5]
744 ; CHECK-NEXT: ldrb r6, [r6]
745 ; CHECK-NEXT: vmov.8 q0[2], r0
746 ; CHECK-NEXT: vmov r0, r5, d4
747 ; CHECK-NEXT: vmov.8 q0[3], r6
748 ; CHECK-NEXT: ldrb r0, [r0]
749 ; CHECK-NEXT: ldrb r5, [r5]
750 ; CHECK-NEXT: vmov.8 q0[4], r0
751 ; CHECK-NEXT: vmov.8 q0[5], r5
752 ; CHECK-NEXT: vmov r0, r5, d5
753 ; CHECK-NEXT: ldrb r0, [r0]
754 ; CHECK-NEXT: ldrb r5, [r5]
755 ; CHECK-NEXT: vmov.8 q0[6], r0
756 ; CHECK-NEXT: vmov.8 q0[7], r5
757 ; CHECK-NEXT: vmov r0, r5, d2
758 ; CHECK-NEXT: ldrb r0, [r0]
759 ; CHECK-NEXT: ldrb r5, [r5]
760 ; CHECK-NEXT: vmov.8 q0[8], r0
761 ; CHECK-NEXT: vmov.8 q0[9], r5
762 ; CHECK-NEXT: vmov.8 q0[10], r4
763 ; CHECK-NEXT: vmov.8 q0[11], r2
764 ; CHECK-NEXT: vmov.8 q0[12], r3
765 ; CHECK-NEXT: vmov.8 q0[13], r1
766 ; CHECK-NEXT: vmov.8 q0[14], lr
767 ; CHECK-NEXT: vmov.8 q0[15], r12
768 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
769 ; CHECK-NEXT: .p2align 4
770 ; CHECK-NEXT: @ %bb.1:
771 ; CHECK-NEXT: .LCPI14_0:
772 ; CHECK-NEXT: .long 37 @ 0x25
773 ; CHECK-NEXT: .long 40 @ 0x28
774 ; CHECK-NEXT: .long 43 @ 0x2b
775 ; CHECK-NEXT: .long 46 @ 0x2e
776 ; CHECK-NEXT: .LCPI14_1:
777 ; CHECK-NEXT: .long 257 @ 0x101
778 ; CHECK-NEXT: .long 28 @ 0x1c
779 ; CHECK-NEXT: .long 31 @ 0x1f
780 ; CHECK-NEXT: .long 34 @ 0x22
781 ; CHECK-NEXT: .LCPI14_2:
782 ; CHECK-NEXT: .long 1 @ 0x1
783 ; CHECK-NEXT: .long 4 @ 0x4
784 ; CHECK-NEXT: .long 7 @ 0x7
785 ; CHECK-NEXT: .long 10 @ 0xa
786 ; CHECK-NEXT: .LCPI14_3:
787 ; CHECK-NEXT: .long 13 @ 0xd
788 ; CHECK-NEXT: .long 16 @ 0x10
789 ; CHECK-NEXT: .long 19 @ 0x13
790 ; CHECK-NEXT: .long 22 @ 0x16
792 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 256, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
793 %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 1
794 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
795 ret <16 x i8> %gather
; A second splat GEP of 200 is added on top of constant offsets up to 100,
; so one combined lane offset reaches 300 (0x12c, visible in .LCPI15_2).
; That exceeds the 0..255 range an unsigned byte offset can express, so no
; single vldrb gather is emitted; instead the offsets are materialized from
; four constant-pool <4 x i32> vectors (.LCPI15_0-.LCPI15_3), added to the
; base, and each lane is loaded with a scalar ldrb and inserted via vmov.8.
799 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep7(i8* %base) {
800 ; CHECK-LABEL: unscaled_v16i8_i8_biggep7:
801 ; CHECK: @ %bb.0: @ %entry
802 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
803 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
804 ; CHECK-NEXT: adr r1, .LCPI15_0
805 ; CHECK-NEXT: adr r4, .LCPI15_1
806 ; CHECK-NEXT: vldrw.u32 q0, [r1]
807 ; CHECK-NEXT: adr r7, .LCPI15_3
808 ; CHECK-NEXT: vadd.i32 q0, q0, r0
809 ; CHECK-NEXT: vmov r1, r2, d1
810 ; CHECK-NEXT: vmov r3, r5, d0
811 ; CHECK-NEXT: vldrw.u32 q0, [r4]
812 ; CHECK-NEXT: vadd.i32 q1, q0, r0
813 ; CHECK-NEXT: vmov r4, r6, d3
814 ; CHECK-NEXT: ldrb.w lr, [r1]
815 ; CHECK-NEXT: adr r1, .LCPI15_2
816 ; CHECK-NEXT: vldrw.u32 q0, [r1]
817 ; CHECK-NEXT: ldrb.w r12, [r2]
818 ; CHECK-NEXT: ldrb r1, [r5]
819 ; CHECK-NEXT: vadd.i32 q2, q0, r0
820 ; CHECK-NEXT: ldrb r3, [r3]
821 ; CHECK-NEXT: ldrb r2, [r6]
822 ; CHECK-NEXT: vmov r5, r6, d4
823 ; CHECK-NEXT: ldrb r4, [r4]
824 ; CHECK-NEXT: ldrb r5, [r5]
825 ; CHECK-NEXT: vmov.8 q0[0], r5
826 ; CHECK-NEXT: ldrb r5, [r6]
827 ; CHECK-NEXT: vmov.8 q0[1], r5
828 ; CHECK-NEXT: vmov r5, r6, d5
829 ; CHECK-NEXT: vldrw.u32 q2, [r7]
830 ; CHECK-NEXT: vadd.i32 q2, q2, r0
831 ; CHECK-NEXT: ldrb r0, [r5]
832 ; CHECK-NEXT: ldrb r6, [r6]
833 ; CHECK-NEXT: vmov.8 q0[2], r0
834 ; CHECK-NEXT: vmov r0, r5, d4
835 ; CHECK-NEXT: vmov.8 q0[3], r6
836 ; CHECK-NEXT: ldrb r0, [r0]
837 ; CHECK-NEXT: ldrb r5, [r5]
838 ; CHECK-NEXT: vmov.8 q0[4], r0
839 ; CHECK-NEXT: vmov.8 q0[5], r5
840 ; CHECK-NEXT: vmov r0, r5, d5
841 ; CHECK-NEXT: ldrb r0, [r0]
842 ; CHECK-NEXT: ldrb r5, [r5]
843 ; CHECK-NEXT: vmov.8 q0[6], r0
844 ; CHECK-NEXT: vmov.8 q0[7], r5
845 ; CHECK-NEXT: vmov r0, r5, d2
846 ; CHECK-NEXT: ldrb r0, [r0]
847 ; CHECK-NEXT: ldrb r5, [r5]
848 ; CHECK-NEXT: vmov.8 q0[8], r0
849 ; CHECK-NEXT: vmov.8 q0[9], r5
850 ; CHECK-NEXT: vmov.8 q0[10], r4
851 ; CHECK-NEXT: vmov.8 q0[11], r2
852 ; CHECK-NEXT: vmov.8 q0[12], r3
853 ; CHECK-NEXT: vmov.8 q0[13], r1
854 ; CHECK-NEXT: vmov.8 q0[14], lr
855 ; CHECK-NEXT: vmov.8 q0[15], r12
856 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
857 ; CHECK-NEXT: .p2align 4
858 ; CHECK-NEXT: @ %bb.1:
859 ; CHECK-NEXT: .LCPI15_0:
860 ; CHECK-NEXT: .long 236 @ 0xec
861 ; CHECK-NEXT: .long 239 @ 0xef
862 ; CHECK-NEXT: .long 242 @ 0xf2
863 ; CHECK-NEXT: .long 245 @ 0xf5
864 ; CHECK-NEXT: .LCPI15_1:
865 ; CHECK-NEXT: .long 224 @ 0xe0
866 ; CHECK-NEXT: .long 227 @ 0xe3
867 ; CHECK-NEXT: .long 230 @ 0xe6
868 ; CHECK-NEXT: .long 233 @ 0xe9
869 ; CHECK-NEXT: .LCPI15_2:
870 ; CHECK-NEXT: .long 300 @ 0x12c
871 ; CHECK-NEXT: .long 203 @ 0xcb
872 ; CHECK-NEXT: .long 206 @ 0xce
873 ; CHECK-NEXT: .long 209 @ 0xd1
874 ; CHECK-NEXT: .LCPI15_3:
875 ; CHECK-NEXT: .long 212 @ 0xd4
876 ; CHECK-NEXT: .long 215 @ 0xd7
877 ; CHECK-NEXT: .long 218 @ 0xda
878 ; CHECK-NEXT: .long 221 @ 0xdd
880 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> <i32 100, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
881 %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 200
882 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
883 ret <16 x i8> %gather
; Here the <16 x i8> offsets feed the GEP directly (no zext), so the offsets
; are loaded sign-extended (vldrb.s32 in the expansion below) rather than
; zero-extended. No single vldrb.u8 gather is emitted; the gather is expanded
; to four <4 x i32> address vectors plus per-lane ldrb loads and vmov.8
; lane inserts.
887 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2(i8* %base, <16 x i8>* %offptr) {
888 ; CHECK-LABEL: unscaled_v16i8_i8_2:
889 ; CHECK: @ %bb.0: @ %entry
890 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
891 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
892 ; CHECK-NEXT: vldrb.s32 q0, [r1, #12]
893 ; CHECK-NEXT: vadd.i32 q0, q0, r0
894 ; CHECK-NEXT: vmov r2, r3, d1
895 ; CHECK-NEXT: vmov r4, r5, d0
896 ; CHECK-NEXT: vldrb.s32 q0, [r1]
897 ; CHECK-NEXT: vadd.i32 q2, q0, r0
898 ; CHECK-NEXT: vldrb.s32 q0, [r1, #8]
899 ; CHECK-NEXT: vadd.i32 q1, q0, r0
900 ; CHECK-NEXT: ldrb.w r12, [r2]
901 ; CHECK-NEXT: ldrb.w lr, [r3]
902 ; CHECK-NEXT: ldrb r3, [r4]
903 ; CHECK-NEXT: ldrb r2, [r5]
904 ; CHECK-NEXT: vmov r4, r5, d4
905 ; CHECK-NEXT: ldrb r4, [r4]
906 ; CHECK-NEXT: ldrb r5, [r5]
907 ; CHECK-NEXT: vmov.8 q0[0], r4
908 ; CHECK-NEXT: vmov r4, r6, d3
909 ; CHECK-NEXT: vmov.8 q0[1], r5
910 ; CHECK-NEXT: ldrb r5, [r4]
911 ; CHECK-NEXT: ldrb r4, [r6]
912 ; CHECK-NEXT: vmov r6, r7, d5
913 ; CHECK-NEXT: vldrb.s32 q2, [r1, #4]
914 ; CHECK-NEXT: vadd.i32 q2, q2, r0
915 ; CHECK-NEXT: ldrb r0, [r6]
916 ; CHECK-NEXT: ldrb r7, [r7]
917 ; CHECK-NEXT: vmov.8 q0[2], r0
918 ; CHECK-NEXT: vmov r0, r1, d4
919 ; CHECK-NEXT: vmov.8 q0[3], r7
920 ; CHECK-NEXT: ldrb r0, [r0]
921 ; CHECK-NEXT: ldrb r1, [r1]
922 ; CHECK-NEXT: vmov.8 q0[4], r0
923 ; CHECK-NEXT: vmov.8 q0[5], r1
924 ; CHECK-NEXT: vmov r0, r1, d5
925 ; CHECK-NEXT: ldrb r0, [r0]
926 ; CHECK-NEXT: ldrb r1, [r1]
927 ; CHECK-NEXT: vmov.8 q0[6], r0
928 ; CHECK-NEXT: vmov.8 q0[7], r1
929 ; CHECK-NEXT: vmov r0, r1, d2
930 ; CHECK-NEXT: ldrb r0, [r0]
931 ; CHECK-NEXT: ldrb r1, [r1]
932 ; CHECK-NEXT: vmov.8 q0[8], r0
933 ; CHECK-NEXT: vmov.8 q0[9], r1
934 ; CHECK-NEXT: vmov.8 q0[10], r5
935 ; CHECK-NEXT: vmov.8 q0[11], r4
936 ; CHECK-NEXT: vmov.8 q0[12], r3
937 ; CHECK-NEXT: vmov.8 q0[13], r2
938 ; CHECK-NEXT: vmov.8 q0[14], r12
939 ; CHECK-NEXT: vmov.8 q0[15], lr
940 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
942 %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
943 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
944 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
945 ret <16 x i8> %gather
; Constant i8 offsets, all within an unsigned byte (0..45), are emitted as a
; <16 x i8> constant pool (.LCPI17_0) and folded into a single vldrb.u8
; gather with register-offset form [r0, q1].
949 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_3(i8* %base, <16 x i8>* %offptr) {
950 ; CHECK-LABEL: unscaled_v16i8_i8_3:
951 ; CHECK: @ %bb.0: @ %entry
952 ; CHECK-NEXT: adr r1, .LCPI17_0
953 ; CHECK-NEXT: vldrw.u32 q1, [r1]
954 ; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
956 ; CHECK-NEXT: .p2align 4
957 ; CHECK-NEXT: @ %bb.1:
958 ; CHECK-NEXT: .LCPI17_0:
959 ; CHECK-NEXT: .byte 0 @ 0x0
960 ; CHECK-NEXT: .byte 3 @ 0x3
961 ; CHECK-NEXT: .byte 6 @ 0x6
962 ; CHECK-NEXT: .byte 9 @ 0x9
963 ; CHECK-NEXT: .byte 12 @ 0xc
964 ; CHECK-NEXT: .byte 15 @ 0xf
965 ; CHECK-NEXT: .byte 18 @ 0x12
966 ; CHECK-NEXT: .byte 21 @ 0x15
967 ; CHECK-NEXT: .byte 24 @ 0x18
968 ; CHECK-NEXT: .byte 27 @ 0x1b
969 ; CHECK-NEXT: .byte 30 @ 0x1e
970 ; CHECK-NEXT: .byte 33 @ 0x21
971 ; CHECK-NEXT: .byte 36 @ 0x24
972 ; CHECK-NEXT: .byte 39 @ 0x27
973 ; CHECK-NEXT: .byte 42 @ 0x2a
974 ; CHECK-NEXT: .byte 45 @ 0x2d
976 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> <i8 0, i8 3, i8 6, i8 9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45>
977 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
978 ret <16 x i8> %gather
; The zero-extended byte offsets index an i16 base, so every offset is scaled
; by 2 (the vshl.i32 #1 after each vldrb.u32) before being added to the base
; pointer. The resulting i8 gather from i16-strided addresses is expanded to
; per-lane ldrb loads with vmov.8 lane inserts rather than a vldrb gather.
981 define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_basei16(i16* %base, <16 x i8>* %offptr) {
982 ; CHECK-LABEL: unscaled_v16i8_basei16:
983 ; CHECK: @ %bb.0: @ %entry
984 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
985 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
986 ; CHECK-NEXT: vldrb.u32 q0, [r1, #12]
987 ; CHECK-NEXT: vshl.i32 q0, q0, #1
988 ; CHECK-NEXT: vadd.i32 q0, q0, r0
989 ; CHECK-NEXT: vmov r2, r3, d1
990 ; CHECK-NEXT: vmov r4, r5, d0
991 ; CHECK-NEXT: vldrb.u32 q0, [r1]
992 ; CHECK-NEXT: vshl.i32 q0, q0, #1
993 ; CHECK-NEXT: vadd.i32 q2, q0, r0
994 ; CHECK-NEXT: vldrb.u32 q0, [r1, #8]
995 ; CHECK-NEXT: vshl.i32 q0, q0, #1
996 ; CHECK-NEXT: vadd.i32 q1, q0, r0
997 ; CHECK-NEXT: ldrb.w r12, [r2]
998 ; CHECK-NEXT: ldrb.w lr, [r3]
999 ; CHECK-NEXT: ldrb r3, [r4]
1000 ; CHECK-NEXT: ldrb r2, [r5]
1001 ; CHECK-NEXT: vmov r4, r5, d4
1002 ; CHECK-NEXT: ldrb r4, [r4]
1003 ; CHECK-NEXT: ldrb r5, [r5]
1004 ; CHECK-NEXT: vmov.8 q0[0], r4
1005 ; CHECK-NEXT: vmov r4, r6, d3
1006 ; CHECK-NEXT: vmov.8 q0[1], r5
1007 ; CHECK-NEXT: ldrb r5, [r4]
1008 ; CHECK-NEXT: ldrb r4, [r6]
1009 ; CHECK-NEXT: vmov r6, r7, d5
1010 ; CHECK-NEXT: vldrb.u32 q2, [r1, #4]
1011 ; CHECK-NEXT: vshl.i32 q2, q2, #1
1012 ; CHECK-NEXT: vadd.i32 q2, q2, r0
1013 ; CHECK-NEXT: ldrb r0, [r6]
1014 ; CHECK-NEXT: ldrb r7, [r7]
1015 ; CHECK-NEXT: vmov.8 q0[2], r0
1016 ; CHECK-NEXT: vmov r0, r1, d4
1017 ; CHECK-NEXT: vmov.8 q0[3], r7
1018 ; CHECK-NEXT: ldrb r0, [r0]
1019 ; CHECK-NEXT: ldrb r1, [r1]
1020 ; CHECK-NEXT: vmov.8 q0[4], r0
1021 ; CHECK-NEXT: vmov.8 q0[5], r1
1022 ; CHECK-NEXT: vmov r0, r1, d5
1023 ; CHECK-NEXT: ldrb r0, [r0]
1024 ; CHECK-NEXT: ldrb r1, [r1]
1025 ; CHECK-NEXT: vmov.8 q0[6], r0
1026 ; CHECK-NEXT: vmov.8 q0[7], r1
1027 ; CHECK-NEXT: vmov r0, r1, d2
1028 ; CHECK-NEXT: ldrb r0, [r0]
1029 ; CHECK-NEXT: ldrb r1, [r1]
1030 ; CHECK-NEXT: vmov.8 q0[8], r0
1031 ; CHECK-NEXT: vmov.8 q0[9], r1
1032 ; CHECK-NEXT: vmov.8 q0[10], r5
1033 ; CHECK-NEXT: vmov.8 q0[11], r4
1034 ; CHECK-NEXT: vmov.8 q0[12], r3
1035 ; CHECK-NEXT: vmov.8 q0[13], r2
1036 ; CHECK-NEXT: vmov.8 q0[14], r12
1037 ; CHECK-NEXT: vmov.8 q0[15], lr
1038 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
1040 %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
1041 %offs.zext = zext <16 x i8> %offs to <16 x i32>
1042 %ptrs = getelementptr inbounds i16, i16* %base, <16 x i32> %offs.zext
1043 %ptrs.cast = bitcast <16 x i16*> %ptrs to <16 x i8*>
1044 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs.cast, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
1045 ret <16 x i8> %gather
; Declarations of the masked-gather intrinsics exercised by the tests above
; (16-, 8- and 2-lane i8 variants).
1048 declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
1049 declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)
1050 declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>)