1 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; Single-lane load, 64-bit vector of i8: reads the vector at %B, loads one i8
; from %A and inserts it into lane 3.
3 define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
5 ;Check the (default) alignment value.
6 ;CHECK: vld1.8 {d16[3]}, [r0]
7 %tmp1 = load <8 x i8>* %B
; The scalar load is marked align 8, yet the expected instruction above has no
; alignment qualifier on [r0].
8 %tmp2 = load i8* %A, align 8
9 %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
; Single-lane load, 64-bit vector of i16: reads the vector at %B, loads one i16
; from %A and inserts it into lane 2.
13 define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
15 ;Check the alignment value. Max for this instruction is 16 bits:
16 ;CHECK: vld1.16 {d16[2]}, [r0, :16]
17 %tmp1 = load <4 x i16>* %B
; align 8 on the scalar load is larger than the :16 qualifier expected above.
18 %tmp2 = load i16* %A, align 8
19 %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
; Single-lane load, 64-bit vector of i32: reads the vector at %B, loads one i32
; from %A and inserts it into lane 1.
23 define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
25 ;Check the alignment value. Max for this instruction is 32 bits:
26 ;CHECK: vld1.32 {d16[1]}, [r0, :32]
27 %tmp1 = load <2 x i32>* %B
; align 8 on the scalar load is larger than the :32 qualifier expected above.
28 %tmp2 = load i32* %A, align 8
29 %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
; Single-lane load, 128-bit vector of i8: reads the vector at %B, loads one i8
; from %A and inserts it into lane 9.
33 define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
; Lane 9 of the 16-lane vector is expected to be addressed as lane 1 of d17.
35 ;CHECK: vld1.8 {d17[1]}, [r0]
36 %tmp1 = load <16 x i8>* %B
37 %tmp2 = load i8* %A, align 8
38 %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
; Single-lane load, 128-bit vector of i16: reads the vector at %B, loads one
; i16 from %A and inserts it into lane 5.
42 define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; Lane 5 of the 8-lane vector is expected to be addressed as lane 1 of d17,
; with a :16 alignment qualifier.
44 ;CHECK: vld1.16 {d17[1]}, [r0, :16]
45 %tmp1 = load <8 x i16>* %B
46 %tmp2 = load i16* %A, align 8
47 %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
; Single-lane load, 128-bit vector of i32: reads the vector at %B, loads one
; i32 from %A and inserts it into lane 3.
51 define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; Lane 3 of the 4-lane vector is expected to be addressed as lane 1 of d17,
; with a :32 alignment qualifier.
53 ;CHECK: vld1.32 {d17[1]}, [r0, :32]
54 %tmp1 = load <4 x i32>* %B
55 %tmp2 = load i32* %A, align 8
56 %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
; Two-vector aggregate types used as the return types of the vld2lane
; intrinsics: four with 64-bit element vectors, then three with 128-bit ones.
60 %struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
61 %struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
62 %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
63 %struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
65 %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
66 %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
67 %struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
; Two-vector lane load on <8 x i8>: calls llvm.arm.neon.vld2lane.v8i8 on %A,
; passing the vector read from %B as both input vectors, then adds the two
; vectors of the result.
69 define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
71 ;Check the alignment value. Max for this instruction is 16 bits:
72 ;CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16]
73 %tmp1 = load <8 x i8>* %B
; Trailing i32 operands are 1 and 4; they correspond to the [1] lane index and
; the capped :16 alignment qualifier expected above.
74 %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
75 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
76 %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
77 %tmp5 = add <8 x i8> %tmp3, %tmp4
; Two-vector lane load on <4 x i16>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld2lane.v4i16 with the vector read from %B as both input
; vectors, then adds the two vectors of the result.
81 define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
83 ;Check the alignment value. Max for this instruction is 32 bits:
84 ;CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32]
85 %tmp0 = bitcast i16* %A to i8*
86 %tmp1 = load <4 x i16>* %B
; Trailing i32 operands are 1 and 8; they correspond to the [1] lane index and
; the capped :32 alignment qualifier expected above.
87 %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
88 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
89 %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
90 %tmp5 = add <4 x i16> %tmp3, %tmp4
; Two-vector lane load on <2 x i32>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld2lane.v2i32 with the vector read from %B as both input
; vectors (trailing i32 operands 1 and 1), then adds the two result vectors.
94 define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
97 %tmp0 = bitcast i32* %A to i8*
98 %tmp1 = load <2 x i32>* %B
99 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
100 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
101 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
102 %tmp5 = add <2 x i32> %tmp3, %tmp4
; Two-vector lane load on <2 x float>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld2lane.v2f32 with the vector read from %B as both input
; vectors (trailing i32 operands 1 and 1), and returns the fadd of the two
; result vectors.
106 define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
109 %tmp0 = bitcast float* %A to i8*
110 %tmp1 = load <2 x float>* %B
111 %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
112 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
113 %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
114 %tmp5 = fadd <2 x float> %tmp3, %tmp4
115 ret <2 x float> %tmp5
; Two-vector lane load on <8 x i16>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld2lane.v8i16 with the vector read from %B as both input
; vectors, then adds the two vectors of the result.
118 define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
119 ;CHECK: vld2laneQi16:
120 ;Check the (default) alignment.
121 ;CHECK: vld2.16 {d17[1], d19[1]}, [r0]
122 %tmp0 = bitcast i16* %A to i8*
123 %tmp1 = load <8 x i16>* %B
; Trailing i32 operands are 5 and 1: lane 5 of each 8-lane vector is expected
; as lane 1 of an odd D register (d17, d19), with no alignment qualifier.
124 %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
125 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
126 %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
127 %tmp5 = add <8 x i16> %tmp3, %tmp4
; Two-vector lane load on <4 x i32>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld2lane.v4i32 with the vector read from %B as both input
; vectors, then adds the two vectors of the result.
131 define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
132 ;CHECK: vld2laneQi32:
133 ;Check the alignment value. Max for this instruction is 64 bits:
134 ;CHECK: vld2.32 {d17[0], d19[0]}, [r0, :64]
135 %tmp0 = bitcast i32* %A to i8*
136 %tmp1 = load <4 x i32>* %B
; Trailing i32 operands are 2 and 16: lane 2 of each 4-lane vector is expected
; as lane 0 of d17/d19, and the alignment qualifier is capped at :64.
137 %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
138 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
139 %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
140 %tmp5 = add <4 x i32> %tmp3, %tmp4
; Two-vector lane load on <4 x float>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld2lane.v4f32 with the vector read from %B as both input
; vectors (trailing i32 operands 1 and 1), and returns the fadd of the two
; result vectors.
144 define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
147 %tmp0 = bitcast float* %A to i8*
148 %tmp1 = load <4 x float>* %B
149 %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
150 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
151 %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
152 %tmp5 = fadd <4 x float> %tmp3, %tmp4
153 ret <4 x float> %tmp5
; Declarations of the vld2lane intrinsics called above. Each takes an i8*
; address, two input vectors and two i32 operands, and returns the matching
; two-vector struct; all are marked nounwind readonly.
156 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
157 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
158 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
159 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
161 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
162 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
163 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
; Three-vector aggregate types used as the return types of the vld3lane
; intrinsics: four with 64-bit element vectors, then three with 128-bit ones.
165 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
166 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
167 %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
168 %struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
170 %struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
171 %struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
172 %struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
; Three-vector lane load on <8 x i8>: calls llvm.arm.neon.vld3lane.v8i8 on %A
; with the vector read from %B as all three input vectors (trailing i32
; operands 1 and 1), then sums the three result vectors.
174 define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
177 %tmp1 = load <8 x i8>* %B
178 %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
179 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
180 %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
181 %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
182 %tmp6 = add <8 x i8> %tmp3, %tmp4
183 %tmp7 = add <8 x i8> %tmp5, %tmp6
; Three-vector lane load on <4 x i16>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld3lane.v4i16 with the vector read from %B as all three input
; vectors, then sums the three result vectors.
187 define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
189 ;Check the (default) alignment value. VLD3 does not support alignment.
190 ;CHECK: vld3.16 {d16[1], d17[1], d18[1]}, [r0]
191 %tmp0 = bitcast i16* %A to i8*
192 %tmp1 = load <4 x i16>* %B
; Trailing i32 operands are 1 and 8; despite the 8, the expected instruction
; above carries no alignment qualifier.
193 %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
194 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
195 %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
196 %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
197 %tmp6 = add <4 x i16> %tmp3, %tmp4
198 %tmp7 = add <4 x i16> %tmp5, %tmp6
; Three-vector lane load on <2 x i32>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld3lane.v2i32 with the vector read from %B as all three input
; vectors (trailing i32 operands 1 and 1), then sums the three result vectors.
202 define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
205 %tmp0 = bitcast i32* %A to i8*
206 %tmp1 = load <2 x i32>* %B
207 %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
208 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
209 %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
210 %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
211 %tmp6 = add <2 x i32> %tmp3, %tmp4
212 %tmp7 = add <2 x i32> %tmp5, %tmp6
; Three-vector lane load on <2 x float>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld3lane.v2f32 with the vector read from %B as all three input
; vectors (trailing i32 operands 1 and 1), and returns the fadd-sum of the
; three result vectors.
216 define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
219 %tmp0 = bitcast float* %A to i8*
220 %tmp1 = load <2 x float>* %B
221 %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
222 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
223 %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
224 %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
225 %tmp6 = fadd <2 x float> %tmp3, %tmp4
226 %tmp7 = fadd <2 x float> %tmp5, %tmp6
227 ret <2 x float> %tmp7
; Three-vector lane load on <8 x i16>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld3lane.v8i16 with the vector read from %B as all three input
; vectors, then sums the three result vectors.
230 define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
231 ;CHECK: vld3laneQi16:
232 ;Check the (default) alignment value. VLD3 does not support alignment.
233 ;CHECK: vld3.16 {d16[1], d18[1], d20[1]}, [r0]
234 %tmp0 = bitcast i16* %A to i8*
235 %tmp1 = load <8 x i16>* %B
; Trailing i32 operands are 1 and 8: lane 1 is expected in the even D
; registers (d16, d18, d20), with no alignment qualifier despite the 8.
236 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
237 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
238 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
239 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
240 %tmp6 = add <8 x i16> %tmp3, %tmp4
241 %tmp7 = add <8 x i16> %tmp5, %tmp6
; Three-vector lane load on <4 x i32>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld3lane.v4i32 with the vector read from %B as all three input
; vectors (trailing i32 operands 3 and 1), then sums the three result vectors.
245 define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
246 ;CHECK: vld3laneQi32:
248 %tmp0 = bitcast i32* %A to i8*
249 %tmp1 = load <4 x i32>* %B
250 %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
251 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
252 %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
253 %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
254 %tmp6 = add <4 x i32> %tmp3, %tmp4
255 %tmp7 = add <4 x i32> %tmp5, %tmp6
; Three-vector lane load on <4 x float>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld3lane.v4f32 with the vector read from %B as all three input
; vectors (trailing i32 operands 1 and 1), and returns the fadd-sum of the
; three result vectors.
259 define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
262 %tmp0 = bitcast float* %A to i8*
263 %tmp1 = load <4 x float>* %B
264 %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
265 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
266 %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
267 %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
268 %tmp6 = fadd <4 x float> %tmp3, %tmp4
269 %tmp7 = fadd <4 x float> %tmp5, %tmp6
270 ret <4 x float> %tmp7
; Declarations of the vld3lane intrinsics called above. Each takes an i8*
; address, three input vectors and two i32 operands, and returns the matching
; three-vector struct; all are marked nounwind readonly.
273 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
274 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
275 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
276 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
278 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
279 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
280 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
; Four-vector aggregate types used as the return types of the vld4lane
; intrinsics: four with 64-bit element vectors, then three with 128-bit ones.
282 %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
283 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
284 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
285 %struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
287 %struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
288 %struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
289 %struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
; Four-vector lane load on <8 x i8>: calls llvm.arm.neon.vld4lane.v8i8 on %A
; with the vector read from %B as all four input vectors, then sums the four
; result vectors pairwise.
291 define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
293 ;Check the alignment value. Max for this instruction is 32 bits:
294 ;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32]
295 %tmp1 = load <8 x i8>* %B
; Trailing i32 operands are 1 and 8; they correspond to the [1] lane index and
; the capped :32 alignment qualifier expected above.
296 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
297 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
298 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
299 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
300 %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
301 %tmp7 = add <8 x i8> %tmp3, %tmp4
302 %tmp8 = add <8 x i8> %tmp5, %tmp6
303 %tmp9 = add <8 x i8> %tmp7, %tmp8
; Four-vector lane load on <4 x i16>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld4lane.v4i16 with the vector read from %B as all four input
; vectors (trailing i32 operands 1 and 1), then sums the four result vectors
; pairwise.
307 define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
310 %tmp0 = bitcast i16* %A to i8*
311 %tmp1 = load <4 x i16>* %B
312 %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
313 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
314 %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
315 %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
316 %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
317 %tmp7 = add <4 x i16> %tmp3, %tmp4
318 %tmp8 = add <4 x i16> %tmp5, %tmp6
319 %tmp9 = add <4 x i16> %tmp7, %tmp8
; Four-vector lane load on <2 x i32>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld4lane.v2i32 with the vector read from %B as all four input
; vectors, then sums the four result vectors pairwise.
323 define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
325 ;Check the alignment value. Max for this instruction is 128 bits:
326 ;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128]
327 %tmp0 = bitcast i32* %A to i8*
328 %tmp1 = load <2 x i32>* %B
; Trailing i32 operands are 1 and 16; they correspond to the [1] lane index and
; the :128 alignment qualifier expected above.
329 %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
330 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
331 %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
332 %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
333 %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
334 %tmp7 = add <2 x i32> %tmp3, %tmp4
335 %tmp8 = add <2 x i32> %tmp5, %tmp6
336 %tmp9 = add <2 x i32> %tmp7, %tmp8
; Four-vector lane load on <2 x float>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld4lane.v2f32 with the vector read from %B as all four input
; vectors (trailing i32 operands 1 and 1), and returns the pairwise fadd-sum
; of the four result vectors.
340 define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
343 %tmp0 = bitcast float* %A to i8*
344 %tmp1 = load <2 x float>* %B
345 %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
346 %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
347 %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
348 %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
349 %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
350 %tmp7 = fadd <2 x float> %tmp3, %tmp4
351 %tmp8 = fadd <2 x float> %tmp5, %tmp6
352 %tmp9 = fadd <2 x float> %tmp7, %tmp8
353 ret <2 x float> %tmp9
; Four-vector lane load on <8 x i16>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld4lane.v8i16 with the vector read from %B as all four input
; vectors, then sums the four result vectors pairwise.
356 define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
357 ;CHECK: vld4laneQi16:
358 ;Check the alignment value. Max for this instruction is 64 bits:
359 ;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0, :64]
360 %tmp0 = bitcast i16* %A to i8*
361 %tmp1 = load <8 x i16>* %B
; Trailing i32 operands are 1 and 16: lane 1 is expected in the even D
; registers (d16..d22), and the alignment qualifier is capped at :64.
362 %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
363 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
364 %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
365 %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
366 %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
367 %tmp7 = add <8 x i16> %tmp3, %tmp4
368 %tmp8 = add <8 x i16> %tmp5, %tmp6
369 %tmp9 = add <8 x i16> %tmp7, %tmp8
; Four-vector lane load on <4 x i32>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld4lane.v4i32 with the vector read from %B as all four input
; vectors, then sums the four result vectors pairwise.
373 define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
374 ;CHECK: vld4laneQi32:
375 ;Check the (default) alignment.
376 ;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
377 %tmp0 = bitcast i32* %A to i8*
378 %tmp1 = load <4 x i32>* %B
; Trailing i32 operands are 2 and 1: lane 2 of each 4-lane vector is expected
; as lane 0 of the odd D registers (d17..d23), with no alignment qualifier.
379 %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
380 %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
381 %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
382 %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
383 %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
384 %tmp7 = add <4 x i32> %tmp3, %tmp4
385 %tmp8 = add <4 x i32> %tmp5, %tmp6
386 %tmp9 = add <4 x i32> %tmp7, %tmp8
; Four-vector lane load on <4 x float>: bitcasts %A to i8*, calls
; llvm.arm.neon.vld4lane.v4f32 with the vector read from %B as all four input
; vectors (trailing i32 operands 1 and 1), and returns the pairwise fadd-sum
; of the four result vectors.
390 define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
393 %tmp0 = bitcast float* %A to i8*
394 %tmp1 = load <4 x float>* %B
395 %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
396 %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
397 %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
398 %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
399 %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
400 %tmp7 = fadd <4 x float> %tmp3, %tmp4
401 %tmp8 = fadd <4 x float> %tmp5, %tmp6
402 %tmp9 = fadd <4 x float> %tmp7, %tmp8
403 ret <4 x float> %tmp9
; Declarations of the vld4lane intrinsics called above. Each takes an i8*
; address, four input vectors and two i32 operands, and returns the matching
; four-vector struct; all are marked nounwind readonly.
406 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
407 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
408 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
409 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
411 declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
412 declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
413 declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly