1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon | FileCheck %s --check-prefixes=CHECK,DEFAULT
3 ; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic | FileCheck %s --check-prefixes=CHECK,BASIC
5 ; Check the (default) alignment value.
6 define <8 x i8> @vld1lanei8(ptr %A, ptr %B) nounwind {
7 ; CHECK-LABEL: vld1lanei8:
9 ; CHECK-NEXT: vldr d16, [r1]
10 ; CHECK-NEXT: vld1.8 {d16[3]}, [r0]
11 ; CHECK-NEXT: vmov r0, r1, d16
12 ; CHECK-NEXT: mov pc, lr
13 %tmp1 = load <8 x i8>, ptr %B
14 %tmp2 = load i8, ptr %A, align 8
15 %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
19 ; Check the alignment value. The maximum for this instruction is 16 bits:
20 define <4 x i16> @vld1lanei16(ptr %A, ptr %B) nounwind {
21 ; CHECK-LABEL: vld1lanei16:
23 ; CHECK-NEXT: vldr d16, [r1]
24 ; CHECK-NEXT: vld1.16 {d16[2]}, [r0:16]
25 ; CHECK-NEXT: vmov r0, r1, d16
26 ; CHECK-NEXT: mov pc, lr
27 %tmp1 = load <4 x i16>, ptr %B
28 %tmp2 = load i16, ptr %A, align 8
29 %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
33 ; Check the alignment value. The maximum for this instruction is 32 bits:
34 define <2 x i32> @vld1lanei32(ptr %A, ptr %B) nounwind {
35 ; CHECK-LABEL: vld1lanei32:
37 ; CHECK-NEXT: vldr d16, [r1]
38 ; CHECK-NEXT: vld1.32 {d16[1]}, [r0:32]
39 ; CHECK-NEXT: vmov r0, r1, d16
40 ; CHECK-NEXT: mov pc, lr
41 %tmp1 = load <2 x i32>, ptr %B
42 %tmp2 = load i32, ptr %A, align 8
43 %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
47 ; Check the alignment value. Legal values are none or :32.
48 define <2 x i32> @vld1lanei32a32(ptr %A, ptr %B) nounwind {
49 ; CHECK-LABEL: vld1lanei32a32:
51 ; CHECK-NEXT: vldr d16, [r1]
52 ; CHECK-NEXT: vld1.32 {d16[1]}, [r0:32]
53 ; CHECK-NEXT: vmov r0, r1, d16
54 ; CHECK-NEXT: mov pc, lr
55 %tmp1 = load <2 x i32>, ptr %B
56 %tmp2 = load i32, ptr %A, align 4
57 %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
61 define <2 x float> @vld1lanef(ptr %A, ptr %B) nounwind {
62 ; CHECK-LABEL: vld1lanef:
64 ; CHECK-NEXT: vldr d16, [r1]
65 ; CHECK-NEXT: vld1.32 {d16[1]}, [r0:32]
66 ; CHECK-NEXT: vmov r0, r1, d16
67 ; CHECK-NEXT: mov pc, lr
68 %tmp1 = load <2 x float>, ptr %B
69 %tmp2 = load float, ptr %A, align 4
70 %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
74 define <16 x i8> @vld1laneQi8(ptr %A, ptr %B) nounwind {
75 ; CHECK-LABEL: vld1laneQi8:
77 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
78 ; CHECK-NEXT: vld1.8 {d17[1]}, [r0]
79 ; CHECK-NEXT: vmov r0, r1, d16
80 ; CHECK-NEXT: vmov r2, r3, d17
81 ; CHECK-NEXT: mov pc, lr
82 %tmp1 = load <16 x i8>, ptr %B
83 %tmp2 = load i8, ptr %A, align 8
84 %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
88 define <8 x i16> @vld1laneQi16(ptr %A, ptr %B) nounwind {
89 ; CHECK-LABEL: vld1laneQi16:
91 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
92 ; CHECK-NEXT: vld1.16 {d17[1]}, [r0:16]
93 ; CHECK-NEXT: vmov r0, r1, d16
94 ; CHECK-NEXT: vmov r2, r3, d17
95 ; CHECK-NEXT: mov pc, lr
96 %tmp1 = load <8 x i16>, ptr %B
97 %tmp2 = load i16, ptr %A, align 8
98 %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
102 define <4 x i32> @vld1laneQi32(ptr %A, ptr %B) nounwind {
103 ; CHECK-LABEL: vld1laneQi32:
105 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
106 ; CHECK-NEXT: vld1.32 {d17[1]}, [r0:32]
107 ; CHECK-NEXT: vmov r0, r1, d16
108 ; CHECK-NEXT: vmov r2, r3, d17
109 ; CHECK-NEXT: mov pc, lr
110 %tmp1 = load <4 x i32>, ptr %B
111 %tmp2 = load i32, ptr %A, align 8
112 %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
116 define <4 x float> @vld1laneQf(ptr %A, ptr %B) nounwind {
117 ; CHECK-LABEL: vld1laneQf:
119 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
120 ; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32]
121 ; CHECK-NEXT: vmov r2, r3, d17
122 ; CHECK-NEXT: vmov r0, r1, d16
123 ; CHECK-NEXT: mov pc, lr
124 %tmp1 = load <4 x float>, ptr %B
125 %tmp2 = load float, ptr %A
126 %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
127 ret <4 x float> %tmp3
130 %struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
131 %struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
132 %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
133 %struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
135 %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
136 %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
137 %struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
139 ; Check the alignment value. The maximum for this instruction is 16 bits:
140 define <8 x i8> @vld2lanei8(ptr %A, ptr %B) nounwind {
141 ; CHECK-LABEL: vld2lanei8:
143 ; CHECK-NEXT: vldr d16, [r1]
144 ; CHECK-NEXT: vorr d17, d16, d16
145 ; CHECK-NEXT: vld2.8 {d16[1], d17[1]}, [r0:16]
146 ; CHECK-NEXT: vadd.i8 d16, d16, d17
147 ; CHECK-NEXT: vmov r0, r1, d16
148 ; CHECK-NEXT: mov pc, lr
149 %tmp1 = load <8 x i8>, ptr %B
150 %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
151 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
152 %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
153 %tmp5 = add <8 x i8> %tmp3, %tmp4
157 ; Check the alignment value. The maximum for this instruction is 32 bits:
158 define <4 x i16> @vld2lanei16(ptr %A, ptr %B) nounwind {
159 ; CHECK-LABEL: vld2lanei16:
161 ; CHECK-NEXT: vldr d16, [r1]
162 ; CHECK-NEXT: vorr d17, d16, d16
163 ; CHECK-NEXT: vld2.16 {d16[1], d17[1]}, [r0:32]
164 ; CHECK-NEXT: vadd.i16 d16, d16, d17
165 ; CHECK-NEXT: vmov r0, r1, d16
166 ; CHECK-NEXT: mov pc, lr
167 %tmp1 = load <4 x i16>, ptr %B
168 %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
169 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
170 %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
171 %tmp5 = add <4 x i16> %tmp3, %tmp4
175 define <2 x i32> @vld2lanei32(ptr %A, ptr %B) nounwind {
176 ; CHECK-LABEL: vld2lanei32:
178 ; CHECK-NEXT: vldr d16, [r1]
179 ; CHECK-NEXT: vorr d17, d16, d16
180 ; CHECK-NEXT: vld2.32 {d16[1], d17[1]}, [r0]
181 ; CHECK-NEXT: vadd.i32 d16, d16, d17
182 ; CHECK-NEXT: vmov r0, r1, d16
183 ; CHECK-NEXT: mov pc, lr
184 %tmp1 = load <2 x i32>, ptr %B
185 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
186 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
187 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
188 %tmp5 = add <2 x i32> %tmp3, %tmp4
192 ; Check for a post-increment updating load.
193 define <2 x i32> @vld2lanei32_update(ptr %ptr, ptr %B) nounwind {
194 ; DEFAULT-LABEL: vld2lanei32_update:
196 ; DEFAULT-NEXT: vldr d16, [r1]
197 ; DEFAULT-NEXT: ldr r3, [r0]
198 ; DEFAULT-NEXT: vorr d17, d16, d16
199 ; DEFAULT-NEXT: vld2.32 {d16[1], d17[1]}, [r3]!
200 ; DEFAULT-NEXT: vadd.i32 d16, d16, d17
201 ; DEFAULT-NEXT: str r3, [r0]
202 ; DEFAULT-NEXT: vmov r2, r1, d16
203 ; DEFAULT-NEXT: mov r0, r2
204 ; DEFAULT-NEXT: mov pc, lr
206 ; BASIC-LABEL: vld2lanei32_update:
208 ; BASIC-NEXT: mov r2, r1
209 ; BASIC-NEXT: mov r1, r0
210 ; BASIC-NEXT: vldr d16, [r2]
211 ; BASIC-NEXT: ldr r0, [r0]
212 ; BASIC-NEXT: vorr d17, d16, d16
213 ; BASIC-NEXT: vld2.32 {d16[1], d17[1]}, [r0]!
214 ; BASIC-NEXT: vadd.i32 d16, d16, d17
215 ; BASIC-NEXT: str r0, [r1]
216 ; BASIC-NEXT: vmov r2, r3, d16
217 ; BASIC-NEXT: mov r0, r2
218 ; BASIC-NEXT: mov r1, r3
219 ; BASIC-NEXT: mov pc, lr
220 %A = load ptr, ptr %ptr
221 %tmp1 = load <2 x i32>, ptr %B
222 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
223 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
224 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
225 %tmp5 = add <2 x i32> %tmp3, %tmp4
226 %tmp6 = getelementptr i32, ptr %A, i32 2
227 store ptr %tmp6, ptr %ptr
231 define <2 x i32> @vld2lanei32_odd_update(ptr %ptr, ptr %B) nounwind {
232 ; DEFAULT-LABEL: vld2lanei32_odd_update:
234 ; DEFAULT-NEXT: vldr d16, [r1]
235 ; DEFAULT-NEXT: mov r1, #12
236 ; DEFAULT-NEXT: ldr r3, [r0]
237 ; DEFAULT-NEXT: vorr d17, d16, d16
238 ; DEFAULT-NEXT: vld2.32 {d16[1], d17[1]}, [r3], r1
239 ; DEFAULT-NEXT: vadd.i32 d16, d16, d17
240 ; DEFAULT-NEXT: str r3, [r0]
241 ; DEFAULT-NEXT: vmov r2, r1, d16
242 ; DEFAULT-NEXT: mov r0, r2
243 ; DEFAULT-NEXT: mov pc, lr
245 ; BASIC-LABEL: vld2lanei32_odd_update:
247 ; BASIC-NEXT: mov r2, r1
248 ; BASIC-NEXT: mov r1, r0
249 ; BASIC-NEXT: vldr d16, [r2]
250 ; BASIC-NEXT: mov r2, #12
251 ; BASIC-NEXT: ldr r0, [r0]
252 ; BASIC-NEXT: vorr d17, d16, d16
253 ; BASIC-NEXT: vld2.32 {d16[1], d17[1]}, [r0], r2
254 ; BASIC-NEXT: vadd.i32 d16, d16, d17
255 ; BASIC-NEXT: str r0, [r1]
256 ; BASIC-NEXT: vmov r2, r3, d16
257 ; BASIC-NEXT: mov r0, r2
258 ; BASIC-NEXT: mov r1, r3
259 ; BASIC-NEXT: mov pc, lr
260 %A = load ptr, ptr %ptr
261 %tmp1 = load <2 x i32>, ptr %B
262 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
263 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
264 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
265 %tmp5 = add <2 x i32> %tmp3, %tmp4
266 %tmp6 = getelementptr i32, ptr %A, i32 3
267 store ptr %tmp6, ptr %ptr
271 define <2 x float> @vld2lanef(ptr %A, ptr %B) nounwind {
272 ; CHECK-LABEL: vld2lanef:
274 ; CHECK-NEXT: vldr d16, [r1]
275 ; CHECK-NEXT: vorr d17, d16, d16
276 ; CHECK-NEXT: vld2.32 {d16[1], d17[1]}, [r0]
277 ; CHECK-NEXT: vadd.f32 d16, d16, d17
278 ; CHECK-NEXT: vmov r0, r1, d16
279 ; CHECK-NEXT: mov pc, lr
280 %tmp1 = load <2 x float>, ptr %B
281 %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0(ptr %A, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
282 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
283 %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
284 %tmp5 = fadd <2 x float> %tmp3, %tmp4
285 ret <2 x float> %tmp5
288 ; Check the (default) alignment.
289 define <8 x i16> @vld2laneQi16(ptr %A, ptr %B) nounwind {
290 ; CHECK-LABEL: vld2laneQi16:
292 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
293 ; CHECK-NEXT: vorr q9, q8, q8
294 ; CHECK-NEXT: vld2.16 {d17[1], d19[1]}, [r0]
295 ; CHECK-NEXT: vadd.i16 q8, q8, q9
296 ; CHECK-NEXT: vmov r0, r1, d16
297 ; CHECK-NEXT: vmov r2, r3, d17
298 ; CHECK-NEXT: mov pc, lr
299 %tmp1 = load <8 x i16>, ptr %B
300 %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
301 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
302 %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
303 %tmp5 = add <8 x i16> %tmp3, %tmp4
307 ; Check the alignment value. The maximum for this instruction is 64 bits:
308 define <4 x i32> @vld2laneQi32(ptr %A, ptr %B) nounwind {
309 ; CHECK-LABEL: vld2laneQi32:
311 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
312 ; CHECK-NEXT: vorr q9, q8, q8
313 ; CHECK-NEXT: vld2.32 {d17[0], d19[0]}, [r0:64]
314 ; CHECK-NEXT: vadd.i32 q8, q8, q9
315 ; CHECK-NEXT: vmov r0, r1, d16
316 ; CHECK-NEXT: vmov r2, r3, d17
317 ; CHECK-NEXT: mov pc, lr
318 %tmp1 = load <4 x i32>, ptr %B
319 %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
320 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
321 %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
322 %tmp5 = add <4 x i32> %tmp3, %tmp4
326 define <4 x float> @vld2laneQf(ptr %A, ptr %B) nounwind {
327 ; CHECK-LABEL: vld2laneQf:
329 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
330 ; CHECK-NEXT: vorr q9, q8, q8
331 ; CHECK-NEXT: vld2.32 {d16[1], d18[1]}, [r0]
332 ; CHECK-NEXT: vadd.f32 q8, q8, q9
333 ; CHECK-NEXT: vmov r0, r1, d16
334 ; CHECK-NEXT: vmov r2, r3, d17
335 ; CHECK-NEXT: mov pc, lr
336 %tmp1 = load <4 x float>, ptr %B
337 %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0(ptr %A, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
338 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
339 %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
340 %tmp5 = fadd <4 x float> %tmp3, %tmp4
341 ret <4 x float> %tmp5
344 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
345 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
346 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
347 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0(ptr, <2 x float>, <2 x float>, i32, i32) nounwind readonly
349 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0(ptr, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
350 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0(ptr, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
351 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0(ptr, <4 x float>, <4 x float>, i32, i32) nounwind readonly
353 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
354 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
355 %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
356 %struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
358 %struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
359 %struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
360 %struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
362 define <8 x i8> @vld3lanei8(ptr %A, ptr %B) nounwind {
363 ; DEFAULT-LABEL: vld3lanei8:
365 ; DEFAULT-NEXT: vldr d16, [r1]
366 ; DEFAULT-NEXT: vorr d17, d16, d16
367 ; DEFAULT-NEXT: vorr d18, d16, d16
368 ; DEFAULT-NEXT: vld3.8 {d16[1], d17[1], d18[1]}, [r0]
369 ; DEFAULT-NEXT: vadd.i8 d20, d16, d17
370 ; DEFAULT-NEXT: vadd.i8 d16, d18, d20
371 ; DEFAULT-NEXT: vmov r0, r1, d16
372 ; DEFAULT-NEXT: mov pc, lr
374 ; BASIC-LABEL: vld3lanei8:
376 ; BASIC-NEXT: vldr d18, [r1]
377 ; BASIC-NEXT: vorr d19, d18, d18
378 ; BASIC-NEXT: vorr d20, d18, d18
379 ; BASIC-NEXT: vld3.8 {d18[1], d19[1], d20[1]}, [r0]
380 ; BASIC-NEXT: vadd.i8 d16, d18, d19
381 ; BASIC-NEXT: vadd.i8 d16, d20, d16
382 ; BASIC-NEXT: vmov r0, r1, d16
383 ; BASIC-NEXT: mov pc, lr
384 %tmp1 = load <8 x i8>, ptr %B
385 %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
386 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
387 %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
388 %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
389 %tmp6 = add <8 x i8> %tmp3, %tmp4
390 %tmp7 = add <8 x i8> %tmp5, %tmp6
394 ; Check the (default) alignment value. VLD3 does not support alignment.
395 define <4 x i16> @vld3lanei16(ptr %A, ptr %B) nounwind {
396 ; DEFAULT-LABEL: vld3lanei16:
398 ; DEFAULT-NEXT: vldr d16, [r1]
399 ; DEFAULT-NEXT: vorr d17, d16, d16
400 ; DEFAULT-NEXT: vorr d18, d16, d16
401 ; DEFAULT-NEXT: vld3.16 {d16[1], d17[1], d18[1]}, [r0]
402 ; DEFAULT-NEXT: vadd.i16 d20, d16, d17
403 ; DEFAULT-NEXT: vadd.i16 d16, d18, d20
404 ; DEFAULT-NEXT: vmov r0, r1, d16
405 ; DEFAULT-NEXT: mov pc, lr
407 ; BASIC-LABEL: vld3lanei16:
409 ; BASIC-NEXT: vldr d18, [r1]
410 ; BASIC-NEXT: vorr d19, d18, d18
411 ; BASIC-NEXT: vorr d20, d18, d18
412 ; BASIC-NEXT: vld3.16 {d18[1], d19[1], d20[1]}, [r0]
413 ; BASIC-NEXT: vadd.i16 d16, d18, d19
414 ; BASIC-NEXT: vadd.i16 d16, d20, d16
415 ; BASIC-NEXT: vmov r0, r1, d16
416 ; BASIC-NEXT: mov pc, lr
417 %tmp1 = load <4 x i16>, ptr %B
418 %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
419 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
420 %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
421 %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
422 %tmp6 = add <4 x i16> %tmp3, %tmp4
423 %tmp7 = add <4 x i16> %tmp5, %tmp6
427 define <2 x i32> @vld3lanei32(ptr %A, ptr %B) nounwind {
428 ; DEFAULT-LABEL: vld3lanei32:
430 ; DEFAULT-NEXT: vldr d16, [r1]
431 ; DEFAULT-NEXT: vorr d17, d16, d16
432 ; DEFAULT-NEXT: vorr d18, d16, d16
433 ; DEFAULT-NEXT: vld3.32 {d16[1], d17[1], d18[1]}, [r0]
434 ; DEFAULT-NEXT: vadd.i32 d20, d16, d17
435 ; DEFAULT-NEXT: vadd.i32 d16, d18, d20
436 ; DEFAULT-NEXT: vmov r0, r1, d16
437 ; DEFAULT-NEXT: mov pc, lr
439 ; BASIC-LABEL: vld3lanei32:
441 ; BASIC-NEXT: vldr d18, [r1]
442 ; BASIC-NEXT: vorr d19, d18, d18
443 ; BASIC-NEXT: vorr d20, d18, d18
444 ; BASIC-NEXT: vld3.32 {d18[1], d19[1], d20[1]}, [r0]
445 ; BASIC-NEXT: vadd.i32 d16, d18, d19
446 ; BASIC-NEXT: vadd.i32 d16, d20, d16
447 ; BASIC-NEXT: vmov r0, r1, d16
448 ; BASIC-NEXT: mov pc, lr
449 %tmp1 = load <2 x i32>, ptr %B
450 %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
451 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
452 %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
453 %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
454 %tmp6 = add <2 x i32> %tmp3, %tmp4
455 %tmp7 = add <2 x i32> %tmp5, %tmp6
459 define <2 x float> @vld3lanef(ptr %A, ptr %B) nounwind {
460 ; DEFAULT-LABEL: vld3lanef:
462 ; DEFAULT-NEXT: vldr d16, [r1]
463 ; DEFAULT-NEXT: vorr d17, d16, d16
464 ; DEFAULT-NEXT: vorr d18, d16, d16
465 ; DEFAULT-NEXT: vld3.32 {d16[1], d17[1], d18[1]}, [r0]
466 ; DEFAULT-NEXT: vadd.f32 d20, d16, d17
467 ; DEFAULT-NEXT: vadd.f32 d16, d18, d20
468 ; DEFAULT-NEXT: vmov r0, r1, d16
469 ; DEFAULT-NEXT: mov pc, lr
471 ; BASIC-LABEL: vld3lanef:
473 ; BASIC-NEXT: vldr d18, [r1]
474 ; BASIC-NEXT: vorr d19, d18, d18
475 ; BASIC-NEXT: vorr d20, d18, d18
476 ; BASIC-NEXT: vld3.32 {d18[1], d19[1], d20[1]}, [r0]
477 ; BASIC-NEXT: vadd.f32 d16, d18, d19
478 ; BASIC-NEXT: vadd.f32 d16, d20, d16
479 ; BASIC-NEXT: vmov r0, r1, d16
480 ; BASIC-NEXT: mov pc, lr
481 %tmp1 = load <2 x float>, ptr %B
482 %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0(ptr %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
483 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
484 %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
485 %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
486 %tmp6 = fadd <2 x float> %tmp3, %tmp4
487 %tmp7 = fadd <2 x float> %tmp5, %tmp6
488 ret <2 x float> %tmp7
491 ; Check the (default) alignment value. VLD3 does not support alignment.
492 define <8 x i16> @vld3laneQi16(ptr %A, ptr %B) nounwind {
493 ; DEFAULT-LABEL: vld3laneQi16:
495 ; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1]
496 ; DEFAULT-NEXT: vorr q9, q8, q8
497 ; DEFAULT-NEXT: vorr q10, q8, q8
498 ; DEFAULT-NEXT: vld3.16 {d16[1], d18[1], d20[1]}, [r0]
499 ; DEFAULT-NEXT: vadd.i16 q12, q8, q9
500 ; DEFAULT-NEXT: vadd.i16 q8, q10, q12
501 ; DEFAULT-NEXT: vmov r0, r1, d16
502 ; DEFAULT-NEXT: vmov r2, r3, d17
503 ; DEFAULT-NEXT: mov pc, lr
505 ; BASIC-LABEL: vld3laneQi16:
507 ; BASIC-NEXT: vld1.64 {d18, d19}, [r1]
508 ; BASIC-NEXT: vorr q10, q9, q9
509 ; BASIC-NEXT: vorr q11, q9, q9
510 ; BASIC-NEXT: vld3.16 {d18[1], d20[1], d22[1]}, [r0]
511 ; BASIC-NEXT: vadd.i16 q8, q9, q10
512 ; BASIC-NEXT: vadd.i16 q8, q11, q8
513 ; BASIC-NEXT: vmov r0, r1, d16
514 ; BASIC-NEXT: vmov r2, r3, d17
515 ; BASIC-NEXT: mov pc, lr
516 %tmp1 = load <8 x i16>, ptr %B
517 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
518 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
519 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
520 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
521 %tmp6 = add <8 x i16> %tmp3, %tmp4
522 %tmp7 = add <8 x i16> %tmp5, %tmp6
526 ; Check for a post-increment updating load with a register increment.
527 define <8 x i16> @vld3laneQi16_update(ptr %ptr, ptr %B, i32 %inc) nounwind {
528 ; DEFAULT-LABEL: vld3laneQi16_update:
530 ; DEFAULT-NEXT: .save {r11, lr}
531 ; DEFAULT-NEXT: push {r11, lr}
532 ; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1]
533 ; DEFAULT-NEXT: lsl r1, r2, #1
534 ; DEFAULT-NEXT: vorr q9, q8, q8
535 ; DEFAULT-NEXT: ldr lr, [r0]
536 ; DEFAULT-NEXT: vorr q10, q8, q8
537 ; DEFAULT-NEXT: vld3.16 {d16[1], d18[1], d20[1]}, [lr], r1
538 ; DEFAULT-NEXT: vadd.i16 q12, q8, q9
539 ; DEFAULT-NEXT: vadd.i16 q8, q10, q12
540 ; DEFAULT-NEXT: str lr, [r0]
541 ; DEFAULT-NEXT: vmov r12, r1, d16
542 ; DEFAULT-NEXT: vmov r2, r3, d17
543 ; DEFAULT-NEXT: mov r0, r12
544 ; DEFAULT-NEXT: pop {r11, lr}
545 ; DEFAULT-NEXT: mov pc, lr
547 ; BASIC-LABEL: vld3laneQi16_update:
549 ; BASIC-NEXT: .save {r11, lr}
550 ; BASIC-NEXT: push {r11, lr}
551 ; BASIC-NEXT: vld1.64 {d18, d19}, [r1]
552 ; BASIC-NEXT: mov r3, r0
553 ; BASIC-NEXT: vorr q10, q9, q9
554 ; BASIC-NEXT: lsl r1, r2, #1
555 ; BASIC-NEXT: ldr r0, [r0]
556 ; BASIC-NEXT: vorr q11, q9, q9
557 ; BASIC-NEXT: vld3.16 {d18[1], d20[1], d22[1]}, [r0], r1
558 ; BASIC-NEXT: vadd.i16 q8, q9, q10
559 ; BASIC-NEXT: vadd.i16 q8, q11, q8
560 ; BASIC-NEXT: str r0, [r3]
561 ; BASIC-NEXT: vmov r1, lr, d16
562 ; BASIC-NEXT: vmov r2, r12, d17
563 ; BASIC-NEXT: mov r0, r1
564 ; BASIC-NEXT: mov r1, lr
565 ; BASIC-NEXT: mov r3, r12
566 ; BASIC-NEXT: pop {r11, lr}
567 ; BASIC-NEXT: mov pc, lr
568 %A = load ptr, ptr %ptr
569 %tmp1 = load <8 x i16>, ptr %B
570 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
571 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
572 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
573 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
574 %tmp6 = add <8 x i16> %tmp3, %tmp4
575 %tmp7 = add <8 x i16> %tmp5, %tmp6
576 %tmp8 = getelementptr i16, ptr %A, i32 %inc
577 store ptr %tmp8, ptr %ptr
581 define <4 x i32> @vld3laneQi32(ptr %A, ptr %B) nounwind {
582 ; DEFAULT-LABEL: vld3laneQi32:
584 ; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1]
585 ; DEFAULT-NEXT: vorr q9, q8, q8
586 ; DEFAULT-NEXT: vorr q10, q8, q8
587 ; DEFAULT-NEXT: vld3.32 {d17[1], d19[1], d21[1]}, [r0]
588 ; DEFAULT-NEXT: vadd.i32 q12, q8, q9
589 ; DEFAULT-NEXT: vadd.i32 q8, q10, q12
590 ; DEFAULT-NEXT: vmov r0, r1, d16
591 ; DEFAULT-NEXT: vmov r2, r3, d17
592 ; DEFAULT-NEXT: mov pc, lr
594 ; BASIC-LABEL: vld3laneQi32:
596 ; BASIC-NEXT: vld1.64 {d18, d19}, [r1]
597 ; BASIC-NEXT: vorr q10, q9, q9
598 ; BASIC-NEXT: vorr q11, q9, q9
599 ; BASIC-NEXT: vld3.32 {d19[1], d21[1], d23[1]}, [r0]
600 ; BASIC-NEXT: vadd.i32 q8, q9, q10
601 ; BASIC-NEXT: vadd.i32 q8, q11, q8
602 ; BASIC-NEXT: vmov r0, r1, d16
603 ; BASIC-NEXT: vmov r2, r3, d17
604 ; BASIC-NEXT: mov pc, lr
605 %tmp1 = load <4 x i32>, ptr %B
606 %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
607 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
608 %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
609 %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
610 %tmp6 = add <4 x i32> %tmp3, %tmp4
611 %tmp7 = add <4 x i32> %tmp5, %tmp6
615 define <4 x float> @vld3laneQf(ptr %A, ptr %B) nounwind {
616 ; DEFAULT-LABEL: vld3laneQf:
618 ; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1]
619 ; DEFAULT-NEXT: vorr q9, q8, q8
620 ; DEFAULT-NEXT: vorr q10, q8, q8
621 ; DEFAULT-NEXT: vld3.32 {d16[1], d18[1], d20[1]}, [r0]
622 ; DEFAULT-NEXT: vadd.f32 q12, q8, q9
623 ; DEFAULT-NEXT: vadd.f32 q8, q10, q12
624 ; DEFAULT-NEXT: vmov r0, r1, d16
625 ; DEFAULT-NEXT: vmov r2, r3, d17
626 ; DEFAULT-NEXT: mov pc, lr
628 ; BASIC-LABEL: vld3laneQf:
630 ; BASIC-NEXT: vld1.64 {d18, d19}, [r1]
631 ; BASIC-NEXT: vorr q10, q9, q9
632 ; BASIC-NEXT: vorr q11, q9, q9
633 ; BASIC-NEXT: vld3.32 {d18[1], d20[1], d22[1]}, [r0]
634 ; BASIC-NEXT: vadd.f32 q8, q9, q10
635 ; BASIC-NEXT: vadd.f32 q8, q11, q8
636 ; BASIC-NEXT: vmov r0, r1, d16
637 ; BASIC-NEXT: vmov r2, r3, d17
638 ; BASIC-NEXT: mov pc, lr
639 %tmp1 = load <4 x float>, ptr %B
640 %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0(ptr %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
641 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
642 %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
643 %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
644 %tmp6 = fadd <4 x float> %tmp3, %tmp4
645 %tmp7 = fadd <4 x float> %tmp5, %tmp6
646 ret <4 x float> %tmp7
649 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
650 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
651 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
652 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0(ptr, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
654 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0(ptr, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
655 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0(ptr, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
656 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0(ptr, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
658 %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
659 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
660 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
661 %struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
663 %struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
664 %struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
665 %struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
667 ; Check the alignment value. The maximum for this instruction is 32 bits:
668 define <8 x i8> @vld4lanei8(ptr %A, ptr %B) nounwind {
669 ; CHECK-LABEL: vld4lanei8:
671 ; CHECK-NEXT: vldr d16, [r1]
672 ; CHECK-NEXT: vorr d17, d16, d16
673 ; CHECK-NEXT: vorr d18, d16, d16
674 ; CHECK-NEXT: vorr d19, d16, d16
675 ; CHECK-NEXT: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]
676 ; CHECK-NEXT: vadd.i8 d16, d16, d17
677 ; CHECK-NEXT: vadd.i8 d20, d18, d19
678 ; CHECK-NEXT: vadd.i8 d16, d16, d20
679 ; CHECK-NEXT: vmov r0, r1, d16
680 ; CHECK-NEXT: mov pc, lr
681 %tmp1 = load <8 x i8>, ptr %B
682 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
683 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
684 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
685 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
686 %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
687 %tmp7 = add <8 x i8> %tmp3, %tmp4
688 %tmp8 = add <8 x i8> %tmp5, %tmp6
689 %tmp9 = add <8 x i8> %tmp7, %tmp8
693 ; Check for a post-increment updating load.
694 define <8 x i8> @vld4lanei8_update(ptr %ptr, ptr %B) nounwind {
695 ; DEFAULT-LABEL: vld4lanei8_update:
697 ; DEFAULT-NEXT: vldr d16, [r1]
698 ; DEFAULT-NEXT: vorr d17, d16, d16
699 ; DEFAULT-NEXT: ldr r3, [r0]
700 ; DEFAULT-NEXT: vorr d18, d16, d16
701 ; DEFAULT-NEXT: vorr d19, d16, d16
702 ; DEFAULT-NEXT: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3:32]!
703 ; DEFAULT-NEXT: vadd.i8 d16, d16, d17
704 ; DEFAULT-NEXT: vadd.i8 d20, d18, d19
705 ; DEFAULT-NEXT: str r3, [r0]
706 ; DEFAULT-NEXT: vadd.i8 d16, d16, d20
707 ; DEFAULT-NEXT: vmov r2, r1, d16
708 ; DEFAULT-NEXT: mov r0, r2
709 ; DEFAULT-NEXT: mov pc, lr
711 ; BASIC-LABEL: vld4lanei8_update:
713 ; BASIC-NEXT: vldr d16, [r1]
714 ; BASIC-NEXT: mov r3, r0
715 ; BASIC-NEXT: vorr d17, d16, d16
716 ; BASIC-NEXT: ldr r0, [r0]
717 ; BASIC-NEXT: vorr d18, d16, d16
718 ; BASIC-NEXT: vorr d19, d16, d16
719 ; BASIC-NEXT: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]!
720 ; BASIC-NEXT: vadd.i8 d16, d16, d17
721 ; BASIC-NEXT: vadd.i8 d20, d18, d19
722 ; BASIC-NEXT: str r0, [r3]
723 ; BASIC-NEXT: vadd.i8 d16, d16, d20
724 ; BASIC-NEXT: vmov r1, r2, d16
725 ; BASIC-NEXT: mov r0, r1
726 ; BASIC-NEXT: mov r1, r2
727 ; BASIC-NEXT: mov pc, lr
728 %A = load ptr, ptr %ptr
729 %tmp1 = load <8 x i8>, ptr %B
730 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
731 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
732 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
733 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
734 %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
735 %tmp7 = add <8 x i8> %tmp3, %tmp4
736 %tmp8 = add <8 x i8> %tmp5, %tmp6
737 %tmp9 = add <8 x i8> %tmp7, %tmp8
738 %tmp10 = getelementptr i8, ptr %A, i32 4
739 store ptr %tmp10, ptr %ptr
743 ; Check that a power-of-two alignment smaller than the total size of the
744 ; memory being loaded is ignored.
745 define <4 x i16> @vld4lanei16(ptr %A, ptr %B) nounwind {
746 ; CHECK-LABEL: vld4lanei16:
748 ; CHECK-NEXT: vldr d16, [r1]
749 ; CHECK-NEXT: vorr d17, d16, d16
750 ; CHECK-NEXT: vorr d18, d16, d16
751 ; CHECK-NEXT: vorr d19, d16, d16
752 ; CHECK-NEXT: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
753 ; CHECK-NEXT: vadd.i16 d16, d16, d17
754 ; CHECK-NEXT: vadd.i16 d20, d18, d19
755 ; CHECK-NEXT: vadd.i16 d16, d16, d20
756 ; CHECK-NEXT: vmov r0, r1, d16
757 ; CHECK-NEXT: mov pc, lr
758 %tmp1 = load <4 x i16>, ptr %B
759 %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
760 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
761 %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
762 %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
763 %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
764 %tmp7 = add <4 x i16> %tmp3, %tmp4
765 %tmp8 = add <4 x i16> %tmp5, %tmp6
766 %tmp9 = add <4 x i16> %tmp7, %tmp8
770 ; Check the alignment value. An 8-byte alignment is allowed here even though
771 ; it is smaller than the total size of the memory being loaded.
772 define <2 x i32> @vld4lanei32(ptr %A, ptr %B) nounwind {
773 ; CHECK-LABEL: vld4lanei32:
775 ; CHECK-NEXT: vldr d16, [r1]
776 ; CHECK-NEXT: vorr d17, d16, d16
777 ; CHECK-NEXT: vorr d18, d16, d16
778 ; CHECK-NEXT: vorr d19, d16, d16
779 ; CHECK-NEXT: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:64]
780 ; CHECK-NEXT: vadd.i32 d16, d16, d17
781 ; CHECK-NEXT: vadd.i32 d20, d18, d19
782 ; CHECK-NEXT: vadd.i32 d16, d16, d20
783 ; CHECK-NEXT: vmov r0, r1, d16
784 ; CHECK-NEXT: mov pc, lr
785 %tmp1 = load <2 x i32>, ptr %B
786 %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
787 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
788 %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
789 %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
790 %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
791 %tmp7 = add <2 x i32> %tmp3, %tmp4
792 %tmp8 = add <2 x i32> %tmp5, %tmp6
793 %tmp9 = add <2 x i32> %tmp7, %tmp8
; Float variant: the intrinsic passes alignment 1, so the expected vld4.32
; lane load uses a plain "[r0]" base with no address-alignment qualifier.
define <2 x float> @vld4lanef(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4lanef:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vorr d17, d16, d16
; CHECK-NEXT: vorr d18, d16, d16
; CHECK-NEXT: vorr d19, d16, d16
; CHECK-NEXT: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0]
; CHECK-NEXT: vadd.f32 d16, d16, d17
; CHECK-NEXT: vadd.f32 d20, d18, d19
; CHECK-NEXT: vadd.f32 d16, d16, d20
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
; Same shape as the integer tests, but with fadd so the float add pattern
; (vadd.f32) is exercised on the four extracted results.
%tmp1 = load <2 x float>, ptr %B
%tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0(ptr %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
%tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
%tmp7 = fadd <2 x float> %tmp3, %tmp4
%tmp8 = fadd <2 x float> %tmp5, %tmp6
%tmp9 = fadd <2 x float> %tmp7, %tmp8
ret <2 x float> %tmp9
;Check the alignment value. Max for this instruction is 64 bits:
; The intrinsic requests alignment 16 (bytes), but vld4.16 lane loads only
; encode up to 64-bit address alignment, so the expected output is ":64".
define <8 x i16> @vld4laneQi16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4laneQi16:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vorr q9, q8, q8
; CHECK-NEXT: vorr q10, q8, q8
; CHECK-NEXT: vorr q11, q8, q8
; CHECK-NEXT: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0:64]
; CHECK-NEXT: vadd.i16 q8, q8, q9
; CHECK-NEXT: vadd.i16 q12, q10, q11
; CHECK-NEXT: vadd.i16 q8, q8, q12
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
; Q-register variant: lane 1 of each <8 x i16> lives in the low d-register
; of its q pair, hence the even-numbered d operands above.
%tmp1 = load <8 x i16>, ptr %B
%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
%tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
%tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
%tmp7 = add <8 x i16> %tmp3, %tmp4
%tmp8 = add <8 x i16> %tmp5, %tmp6
%tmp9 = add <8 x i16> %tmp7, %tmp8
;Check the (default) alignment.
; Alignment 1 is passed, so no address-alignment qualifier is expected.
; Lane index 2 of a <4 x i32> selects element 0 of the high (odd-numbered)
; d-register of each q pair, hence d17[0]/d19[0]/d21[0]/d23[0] below.
define <4 x i32> @vld4laneQi32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4laneQi32:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vorr q9, q8, q8
; CHECK-NEXT: vorr q10, q8, q8
; CHECK-NEXT: vorr q11, q8, q8
; CHECK-NEXT: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
; CHECK-NEXT: vadd.i32 q8, q8, q9
; CHECK-NEXT: vadd.i32 q12, q10, q11
; CHECK-NEXT: vadd.i32 q8, q8, q12
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i32>, ptr %B
%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
%tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
%tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
%tmp7 = add <4 x i32> %tmp3, %tmp4
%tmp8 = add <4 x i32> %tmp5, %tmp6
%tmp9 = add <4 x i32> %tmp7, %tmp8
; Q-register float variant with default (1-byte) alignment: lane 1 selects
; element 1 of the low d-register of each q pair, with no ":n" qualifier.
define <4 x float> @vld4laneQf(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4laneQf:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vorr q9, q8, q8
; CHECK-NEXT: vorr q10, q8, q8
; CHECK-NEXT: vorr q11, q8, q8
; CHECK-NEXT: vld4.32 {d16[1], d18[1], d20[1], d22[1]}, [r0]
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vadd.f32 q12, q10, q11
; CHECK-NEXT: vadd.f32 q8, q8, q12
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
; Sum the four returned vectors with fadd so all results stay live.
%tmp1 = load <4 x float>, ptr %B
%tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0(ptr %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
%tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
%tmp7 = fadd <4 x float> %tmp3, %tmp4
%tmp8 = fadd <4 x float> %tmp5, %tmp6
%tmp9 = fadd <4 x float> %tmp7, %tmp8
ret <4 x float> %tmp9
902 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
903 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
904 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
905 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0(ptr, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
907 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0(ptr, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
908 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0(ptr, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
909 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0(ptr, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass. (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
; This test has separate expectations for the two RUN configurations in the
; file header because the default and basic register allocators assign
; different d/q registers; the instruction sequence is otherwise the same.
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
; DEFAULT-LABEL: test_qqqq_regsequence_subreg:
; DEFAULT-NEXT: add r0, sp, #24
; DEFAULT-NEXT: vld1.32 {d21[0]}, [r0:32]
; DEFAULT-NEXT: add r0, sp, #28
; DEFAULT-NEXT: vmov.i32 d20, #0x0
; DEFAULT-NEXT: vld1.32 {d21[1]}, [r0:32]
; DEFAULT-NEXT: vld3.16 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT: vadd.i16 q12, q8, q9
; DEFAULT-NEXT: vadd.i16 q8, q10, q12
; DEFAULT-NEXT: vmov r0, r1, d16
; DEFAULT-NEXT: vmov r2, r3, d17
; DEFAULT-NEXT: mov pc, lr
; BASIC-LABEL: test_qqqq_regsequence_subreg:
; BASIC-NEXT: add r0, sp, #24
; BASIC-NEXT: vld1.32 {d23[0]}, [r0:32]
; BASIC-NEXT: add r0, sp, #28
; BASIC-NEXT: vmov.i32 d22, #0x0
; BASIC-NEXT: vld1.32 {d23[1]}, [r0:32]
; BASIC-NEXT: vld3.16 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT: vadd.i16 q8, q9, q10
; BASIC-NEXT: vadd.i16 q8, q11, q8
; BASIC-NEXT: vmov r0, r1, d16
; BASIC-NEXT: vmov r2, r3, d17
; BASIC-NEXT: mov pc, lr
; Build an <8 x i16> whose high 64 bits come from the last stack-passed i64
; and whose low 64 bits are the constant 0, then feed it as the third lane
; operand of a vld3 so a QQQQ register sequence is formed from mixed sources.
%tmp63 = extractvalue [6 x i64] %b, 5
%tmp64 = zext i64 %tmp63 to i128
%tmp65 = shl i128 %tmp64, 64
%ins67 = or i128 %tmp65, 0
%tmp78 = bitcast i128 %ins67 to <8 x i16>
%vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0(ptr undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
%tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
%tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
%tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
%tmp6 = add <8 x i16> %tmp3, %tmp4
%tmp7 = add <8 x i16> %tmp5, %tmp6
957 declare void @llvm.trap() nounwind