1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vld1dupi8(ptr %A) nounwind {
; CHECK-LABEL: vld1dupi8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.8 {d16[]}, [r0]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
;Check the (default) alignment value.
%tmp1 = load i8, ptr %A, align 8
%tmp2 = insertelement <8 x i8> undef, i8 %tmp1, i32 0
%tmp3 = shufflevector <8 x i8> %tmp2, <8 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %tmp3
}
;Check a pre-increment updating load.
define <8 x i8> @vld1dupi8_preinc(ptr noalias nocapture %a, i32 %b) nounwind {
; CHECK-LABEL: vld1dupi8_preinc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: add r3, r2, r1
; CHECK-NEXT: str r3, [r0]
; CHECK-NEXT: vld1.8 {d16[]}, [r3]
; CHECK-NEXT: vmov r2, r1, d16
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov pc, lr
entry:
%0 = load ptr, ptr %a, align 4
%add.ptr = getelementptr inbounds i8, ptr %0, i32 %b
%1 = load i8, ptr %add.ptr, align 1
%2 = insertelement <8 x i8> undef, i8 %1, i32 0
%lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
store ptr %add.ptr, ptr %a, align 4
ret <8 x i8> %lane
}
;Check a post-increment updating load with a fixed increment.
define <8 x i8> @vld1dupi8_postinc_fixed(ptr noalias nocapture %a) nounwind {
; CHECK-LABEL: vld1dupi8_postinc_fixed:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: vld1.8 {d16[]}, [r3]!
; CHECK-NEXT: str r3, [r0]
; CHECK-NEXT: vmov r2, r1, d16
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov pc, lr
entry:
%0 = load ptr, ptr %a, align 4
%1 = load i8, ptr %0, align 1
%2 = insertelement <8 x i8> undef, i8 %1, i32 0
%lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
%add.ptr = getelementptr inbounds i8, ptr %0, i32 1
store ptr %add.ptr, ptr %a, align 4
ret <8 x i8> %lane
}
;Check a post-increment updating load with a register increment.
define <8 x i8> @vld1dupi8_postinc_register(ptr noalias nocapture %a, i32 %n) nounwind {
; CHECK-LABEL: vld1dupi8_postinc_register:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: vld1.8 {d16[]}, [r3], r1
; CHECK-NEXT: str r3, [r0]
; CHECK-NEXT: vmov r2, r1, d16
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov pc, lr
entry:
%0 = load ptr, ptr %a, align 4
%1 = load i8, ptr %0, align 1
%2 = insertelement <8 x i8> undef, i8 %1, i32 0
%lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
%add.ptr = getelementptr inbounds i8, ptr %0, i32 %n
store ptr %add.ptr, ptr %a, align 4
ret <8 x i8> %lane
}
;Check a pre-increment updating load (quad register).
define <16 x i8> @vld1dupqi8_preinc(ptr noalias nocapture %a, i32 %b) nounwind {
; CHECK-LABEL: vld1dupqi8_preinc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r11, lr}
; CHECK-NEXT: push {r11, lr}
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: add lr, r2, r1
; CHECK-NEXT: str lr, [r0]
; CHECK-NEXT: vld1.8 {d16[], d17[]}, [lr]
; CHECK-NEXT: vmov r12, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r11, lr}
; CHECK-NEXT: mov pc, lr
entry:
%0 = load ptr, ptr %a, align 4
%add.ptr = getelementptr inbounds i8, ptr %0, i32 %b
%1 = load i8, ptr %add.ptr, align 1
%2 = insertelement <16 x i8> undef, i8 %1, i32 0
%lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
store ptr %add.ptr, ptr %a, align 4
ret <16 x i8> %lane
}
;Check a post-increment updating load with a fixed increment (quad register).
define <16 x i8> @vld1dupqi8_postinc_fixed(ptr noalias nocapture %a) nounwind {
; CHECK-LABEL: vld1dupqi8_postinc_fixed:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r11, lr}
; CHECK-NEXT: push {r11, lr}
; CHECK-NEXT: ldr lr, [r0]
; CHECK-NEXT: vld1.8 {d16[], d17[]}, [lr]!
; CHECK-NEXT: str lr, [r0]
; CHECK-NEXT: vmov r12, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r11, lr}
; CHECK-NEXT: mov pc, lr
entry:
%0 = load ptr, ptr %a, align 4
%1 = load i8, ptr %0, align 1
%2 = insertelement <16 x i8> undef, i8 %1, i32 0
%lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
%add.ptr = getelementptr inbounds i8, ptr %0, i32 1
store ptr %add.ptr, ptr %a, align 4
ret <16 x i8> %lane
}
;Check a post-increment updating load with a register increment (quad register).
define <16 x i8> @vld1dupqi8_postinc_register(ptr noalias nocapture %a, i32 %n) nounwind {
; CHECK-LABEL: vld1dupqi8_postinc_register:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r11, lr}
; CHECK-NEXT: push {r11, lr}
; CHECK-NEXT: ldr lr, [r0]
; CHECK-NEXT: vld1.8 {d16[], d17[]}, [lr], r1
; CHECK-NEXT: str lr, [r0]
; CHECK-NEXT: vmov r12, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r11, lr}
; CHECK-NEXT: mov pc, lr
entry:
%0 = load ptr, ptr %a, align 4
%1 = load i8, ptr %0, align 1
%2 = insertelement <16 x i8> undef, i8 %1, i32 0
%lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
%add.ptr = getelementptr inbounds i8, ptr %0, i32 %n
store ptr %add.ptr, ptr %a, align 4
ret <16 x i8> %lane
}
define <4 x i16> @vld1dupi16(ptr %A) nounwind {
; CHECK-LABEL: vld1dupi16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.16 {d16[]}, [r0:16]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
;Check the alignment value. Max for this instruction is 16 bits:
%tmp1 = load i16, ptr %A, align 8
%tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp3
}
define <4 x i16> @vld1dupi16_misaligned(ptr %A) nounwind {
; CHECK-LABEL: vld1dupi16_misaligned:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.16 {d16[]}, [r0]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load i16, ptr %A, align 1
%tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp3
}
170 ; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i16> @load_i16_dup_zext(ptr %A) nounwind {
; CHECK-LABEL: load_i16_dup_zext:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vdup.16 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load i8, ptr %A, align 1
%tmp2 = zext i8 %tmp1 to i16
%tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
%tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp4
}
185 ; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i16> @load_i16_dup_sext(ptr %A) nounwind {
; CHECK-LABEL: load_i16_dup_sext:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldrsb r0, [r0]
; CHECK-NEXT: vdup.16 d16, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load i8, ptr %A, align 1
%tmp2 = sext i8 %tmp1 to i16
%tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
%tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp4
}
200 ; This sort of looks like a vld1dup, but there's an extension in the way.
define <8 x i16> @load_i16_dupq_zext(ptr %A) nounwind {
; CHECK-LABEL: load_i16_dupq_zext:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vdup.16 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load i8, ptr %A, align 1
%tmp2 = zext i8 %tmp1 to i16
%tmp3 = insertelement <8 x i16> undef, i16 %tmp2, i32 0
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp4
}
define <2 x i32> @vld1dupi32(ptr %A) nounwind {
; CHECK-LABEL: vld1dupi32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.32 {d16[]}, [r0:32]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
;Check the alignment value. Max for this instruction is 32 bits:
%tmp1 = load i32, ptr %A, align 8
%tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
ret <2 x i32> %tmp3
}
229 ; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i32> @load_i32_dup_zext(ptr %A) nounwind {
; CHECK-LABEL: load_i32_dup_zext:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: vdup.32 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load i8, ptr %A, align 1
%tmp2 = zext i8 %tmp1 to i32
%tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
%tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %tmp4
}
245 ; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i32> @load_i32_dup_sext(ptr %A) nounwind {
; CHECK-LABEL: load_i32_dup_sext:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldrsb r0, [r0]
; CHECK-NEXT: vdup.32 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load i8, ptr %A, align 1
%tmp2 = sext i8 %tmp1 to i32
%tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
%tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %tmp4
}
define <2 x float> @vld1dupf(ptr %A) nounwind {
; CHECK-LABEL: vld1dupf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.32 {d16[]}, [r0:32]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp0 = load float, ptr %A
%tmp1 = insertelement <2 x float> undef, float %tmp0, i32 0
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
ret <2 x float> %tmp2
}
define <16 x i8> @vld1dupQi8(ptr %A) nounwind {
; CHECK-LABEL: vld1dupQi8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.8 {d16[], d17[]}, [r0]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
;Check the (default) alignment value.
%tmp1 = load i8, ptr %A, align 8
%tmp2 = insertelement <16 x i8> undef, i8 %tmp1, i32 0
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp3
}
define <4 x float> @vld1dupQf(ptr %A) nounwind {
; CHECK-LABEL: vld1dupQf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.32 {d16[], d17[]}, [r0:32]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp0 = load float, ptr %A
%tmp1 = insertelement <4 x float> undef, float %tmp0, i32 0
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %tmp2
}
300 %struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
301 %struct.__neon_int4x16x2_t = type { <4 x i16>, <4 x i16> }
302 %struct.__neon_int2x32x2_t = type { <2 x i32>, <2 x i32> }
define <8 x i8> @vld2dupi8(ptr %A) nounwind {
; CHECK-LABEL: vld2dupi8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld2.8 {d16[0], d17[0]}, [r0]
; CHECK-NEXT: vadd.i8 d16, d16, d17
; CHECK-NEXT: vdup.8 d16, d16[0]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
;Check the (default) alignment value.
%tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
%tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1
%tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
%tmp5 = add <8 x i8> %tmp2, %tmp4
ret <8 x i8> %tmp5
}
;Check a pre-increment updating vld2 dup load.
define void @vld2dupi8_preinc(ptr noalias nocapture sret(%struct.__neon_int8x8x2_t) %agg.result, ptr noalias nocapture %a, i32 %b) nounwind {
; CHECK-LABEL: vld2dupi8_preinc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r3, [r1]
; CHECK-NEXT: add r2, r3, r2
; CHECK-NEXT: str r2, [r1]
; CHECK-NEXT: vld2.8 {d16[], d17[]}, [r2]
; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
; CHECK-NEXT: vstr d17, [r0]
; CHECK-NEXT: mov pc, lr
entry:
%0 = load ptr, ptr %a, align 4
%add.ptr = getelementptr inbounds i8, ptr %0, i32 %b
%vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %add.ptr, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
%1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
%lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
%2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
%lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
store ptr %add.ptr, ptr %a, align 4
store <8 x i8> %lane, ptr %agg.result, align 8
%r11 = getelementptr inbounds %struct.__neon_int8x8x2_t, ptr %agg.result, i32 0, i32 1
store <8 x i8> %lane1, ptr %r11, align 8
ret void
}
;Check a post-increment updating vld2 dup load with a fixed increment.
define void @vld2dupi8_postinc_fixed(ptr noalias nocapture sret(%struct.__neon_int8x8x2_t) %agg.result, ptr noalias nocapture %a) nounwind {
; CHECK-LABEL: vld2dupi8_postinc_fixed:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r1]
; CHECK-NEXT: vld2.8 {d16[], d17[]}, [r2]!
; CHECK-NEXT: str r2, [r1]
; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
; CHECK-NEXT: vstr d17, [r0]
; CHECK-NEXT: mov pc, lr
entry:
%0 = load ptr, ptr %a, align 4
%vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
%1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
%lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
%2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
%lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
%add.ptr = getelementptr inbounds i8, ptr %0, i32 2
store ptr %add.ptr, ptr %a, align 4
store <8 x i8> %lane, ptr %agg.result, align 8
%r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, ptr %agg.result, i32 0, i32 1
store <8 x i8> %lane1, ptr %r10, align 8
ret void
}
;Check a post-increment updating vld2 dup load with a register increment.
define void @vld2dupi8_postinc_variable(ptr noalias nocapture sret(%struct.__neon_int8x8x2_t) %agg.result, ptr noalias nocapture %a, i32 %n) nounwind {
; CHECK-LABEL: vld2dupi8_postinc_variable:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r3, [r1]
; CHECK-NEXT: vld2.8 {d16[], d17[]}, [r3], r2
; CHECK-NEXT: str r3, [r1]
; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
; CHECK-NEXT: vstr d17, [r0]
; CHECK-NEXT: mov pc, lr
entry:
%0 = load ptr, ptr %a, align 4
%vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
%1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
%lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
%2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
%lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
%add.ptr = getelementptr inbounds i8, ptr %0, i32 %n
store ptr %add.ptr, ptr %a, align 4
store <8 x i8> %lane, ptr %agg.result, align 8
%r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, ptr %agg.result, i32 0, i32 1
store <8 x i8> %lane1, ptr %r10, align 8
ret void
}
define <4 x i16> @vld2dupi16(ptr %A) nounwind {
; CHECK-LABEL: vld2dupi16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld2.16 {d16[0], d17[0]}, [r0]
; CHECK-NEXT: vadd.i16 d16, d16, d17
; CHECK-NEXT: vdup.16 d16, d16[0]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
%tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp5 = add <4 x i16> %tmp2, %tmp4
ret <4 x i16> %tmp5
}
414 ;Check for a post-increment updating load.
define <4 x i16> @vld2dupi16_update(ptr %ptr) nounwind {
; CHECK-LABEL: vld2dupi16_update:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: vld2.16 {d16[0], d17[0]}, [r3]!
; CHECK-NEXT: vadd.i16 d16, d16, d17
; CHECK-NEXT: str r3, [r0]
; CHECK-NEXT: vdup.16 d16, d16[0]
; CHECK-NEXT: vmov r2, r1, d16
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov pc, lr
%A = load ptr, ptr %ptr
%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
%tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp5 = add <4 x i16> %tmp2, %tmp4
%tmp6 = getelementptr i16, ptr %A, i32 2
store ptr %tmp6, ptr %ptr
ret <4 x i16> %tmp5
}
;Check a post-increment whose stride does not match the access size; it must
;use a register increment rather than the fixed-size '!' form.
define <4 x i16> @vld2dupi16_odd_update(ptr %ptr) nounwind {
; CHECK-LABEL: vld2dupi16_odd_update:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: mov r1, #6
; CHECK-NEXT: vld2.16 {d16[0], d17[0]}, [r3], r1
; CHECK-NEXT: vadd.i16 d16, d16, d17
; CHECK-NEXT: str r3, [r0]
; CHECK-NEXT: vdup.16 d16, d16[0]
; CHECK-NEXT: vmov r2, r1, d16
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov pc, lr
%A = load ptr, ptr %ptr
%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
%tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp5 = add <4 x i16> %tmp2, %tmp4
%tmp6 = getelementptr i16, ptr %A, i32 3
store ptr %tmp6, ptr %ptr
ret <4 x i16> %tmp5
}
define <2 x i32> @vld2dupi32(ptr %A) nounwind {
; CHECK-LABEL: vld2dupi32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld2.32 {d16[0], d17[0]}, [r0:64]
; CHECK-NEXT: vadd.i32 d16, d16, d17
; CHECK-NEXT: vdup.32 d16, d16[0]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
;Check the alignment value. Max for this instruction is 64 bits:
%tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
%tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
%tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp5 = add <2 x i32> %tmp2, %tmp4
ret <2 x i32> %tmp5
}
480 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
481 declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
482 declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
484 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
485 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
487 ;Check for a post-increment updating load with register increment.
define <8 x i8> @vld3dupi8_update(ptr %ptr, i32 %inc) nounwind {
; CHECK-LABEL: vld3dupi8_update:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: vld3.8 {d16[0], d17[0], d18[0]}, [r3], r1
; CHECK-NEXT: vadd.i8 d20, d16, d17
; CHECK-NEXT: vadd.i8 d16, d20, d18
; CHECK-NEXT: str r3, [r0]
; CHECK-NEXT: vdup.8 d16, d16[0]
; CHECK-NEXT: vmov r2, r1, d16
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov pc, lr
%A = load ptr, ptr %ptr
%tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8)
%tmp1 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 1
%tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
%tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 2
%tmp6 = shufflevector <8 x i8> %tmp5, <8 x i8> undef, <8 x i32> zeroinitializer
%tmp7 = add <8 x i8> %tmp2, %tmp4
%tmp8 = add <8 x i8> %tmp7, %tmp6
%tmp9 = getelementptr i8, ptr %A, i32 %inc
store ptr %tmp9, ptr %ptr
ret <8 x i8> %tmp8
}
define <4 x i16> @vld3dupi16(ptr %A) nounwind {
; CHECK-LABEL: vld3dupi16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld3.16 {d16[0], d17[0], d18[0]}, [r0]
; CHECK-NEXT: vadd.i16 d20, d16, d17
; CHECK-NEXT: vadd.i16 d16, d20, d18
; CHECK-NEXT: vdup.16 d16, d16[0]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
;Check the (default) alignment value. VLD3 does not support alignment.
%tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
%tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1
%tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 2
%tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp7 = add <4 x i16> %tmp2, %tmp4
%tmp8 = add <4 x i16> %tmp7, %tmp6
ret <4 x i16> %tmp8
}
537 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
538 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
540 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
541 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
543 ;Check for a post-increment updating load.
define <4 x i16> @vld4dupi16_update(ptr %ptr) nounwind {
; CHECK-LABEL: vld4dupi16_update:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: vld4.16 {d16[0], d17[0], d18[0], d19[0]}, [r3]!
; CHECK-NEXT: vadd.i16 d16, d16, d17
; CHECK-NEXT: vadd.i16 d20, d18, d19
; CHECK-NEXT: str r3, [r0]
; CHECK-NEXT: vadd.i16 d16, d16, d20
; CHECK-NEXT: vdup.16 d16, d16[0]
; CHECK-NEXT: vmov r2, r1, d16
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov pc, lr
%A = load ptr, ptr %ptr
%tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
%tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
%tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 2
%tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp7 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 3
%tmp8 = shufflevector <4 x i16> %tmp7, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp9 = add <4 x i16> %tmp2, %tmp4
%tmp10 = add <4 x i16> %tmp6, %tmp8
%tmp11 = add <4 x i16> %tmp9, %tmp10
%tmp12 = getelementptr i16, ptr %A, i32 4
store ptr %tmp12, ptr %ptr
ret <4 x i16> %tmp11
}
define <2 x i32> @vld4dupi32(ptr %A) nounwind {
; CHECK-LABEL: vld4dupi32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld4.32 {d16[0], d17[0], d18[0], d19[0]}, [r0:64]
; CHECK-NEXT: vadd.i32 d16, d16, d17
; CHECK-NEXT: vadd.i32 d20, d18, d19
; CHECK-NEXT: vadd.i32 d16, d16, d20
; CHECK-NEXT: vdup.32 d16, d16[0]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
;Check the alignment value. An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
%tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0(ptr %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
%tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
%tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 2
%tmp6 = shufflevector <2 x i32> %tmp5, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp7 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 3
%tmp8 = shufflevector <2 x i32> %tmp7, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp9 = add <2 x i32> %tmp2, %tmp4
%tmp10 = add <2 x i32> %tmp6, %tmp8
%tmp11 = add <2 x i32> %tmp9, %tmp10
ret <2 x i32> %tmp11
}
602 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
603 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly