// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg | FileCheck %s

// REQUIRES: aarch64-registered-target || arm-registered-target

#include <arm_neon.h>
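
// clang/test/CodeGen/AArch64/neon-ldst-one.c (llvm-project)
//
// This test exercises the AArch64 NEON single-element load intrinsics and
// checks the LLVM IR that Clang emits for them: the vld1*_dup / vldN*_dup
// forms (load one element and replicate it into every lane) and the
// vld1*_lane / vldN*_lane forms (load one element into a chosen lane of an
// existing vector or vector tuple).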
// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_u8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK: ret <16 x i8> [[LANE]]
uint8x16_t test_vld1q_dup_u8(uint8_t *a) {
  return vld1q_dup_u8(a);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_u16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i16> [[LANE]]
uint16x8_t test_vld1q_dup_u16(uint16_t *a) {
  return vld1q_dup_u16(a);
}

// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_u32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i32, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i32> [[LANE]]
uint32x4_t test_vld1q_dup_u32(uint32_t *a) {
  return vld1q_dup_u32(a);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_u64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
uint64x2_t test_vld1q_dup_u64(uint64_t *a) {
  return vld1q_dup_u64(a);
}

// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_s8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK: ret <16 x i8> [[LANE]]
int8x16_t test_vld1q_dup_s8(int8_t *a) {
  return vld1q_dup_s8(a);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_s16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i16> [[LANE]]
int16x8_t test_vld1q_dup_s16(int16_t *a) {
  return vld1q_dup_s16(a);
}

// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_s32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i32, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i32> [[LANE]]
int32x4_t test_vld1q_dup_s32(int32_t *a) {
  return vld1q_dup_s32(a);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_s64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
int64x2_t test_vld1q_dup_s64(int64_t *a) {
  return vld1q_dup_s64(a);
}

// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_dup_f16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load half, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x half> [[LANE]]
float16x8_t test_vld1q_dup_f16(float16_t *a) {
  return vld1q_dup_f16(a);
}

// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_dup_f32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load float, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x float> [[LANE]]
float32x4_t test_vld1q_dup_f32(float32_t *a) {
  return vld1q_dup_f32(a);
}

// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_dup_f64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load double, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x double> [[LANE]]
float64x2_t test_vld1q_dup_f64(float64_t *a) {
  return vld1q_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_p8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK: ret <16 x i8> [[LANE]]
poly8x16_t test_vld1q_dup_p8(poly8_t *a) {
  return vld1q_dup_p8(a);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_p16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i16> [[LANE]]
poly16x8_t test_vld1q_dup_p16(poly16_t *a) {
  return vld1q_dup_p16(a);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_p64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
poly64x2_t test_vld1q_dup_p64(poly64_t *a) {
  return vld1q_dup_p64(a);
}
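
// The 64-bit vld1_dup variants below follow the same pattern as the 128-bit
// ones: a scalar load, an insertelement into lane 0, and a zero-index
// shufflevector that splats the value across the vector.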
// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_u8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i8> [[LANE]]
uint8x8_t test_vld1_dup_u8(uint8_t *a) {
  return vld1_dup_u8(a);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_u16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i16> [[LANE]]
uint16x4_t test_vld1_dup_u16(uint16_t *a) {
  return vld1_dup_u16(a);
}

// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_u32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i32, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i32> [[LANE]]
uint32x2_t test_vld1_dup_u32(uint32_t *a) {
  return vld1_dup_u32(a);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_u64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
uint64x1_t test_vld1_dup_u64(uint64_t *a) {
  return vld1_dup_u64(a);
}

// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_s8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i8> [[LANE]]
int8x8_t test_vld1_dup_s8(int8_t *a) {
  return vld1_dup_s8(a);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_s16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i16> [[LANE]]
int16x4_t test_vld1_dup_s16(int16_t *a) {
  return vld1_dup_s16(a);
}

// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_s32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i32, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i32> [[LANE]]
int32x2_t test_vld1_dup_s32(int32_t *a) {
  return vld1_dup_s32(a);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_s64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
int64x1_t test_vld1_dup_s64(int64_t *a) {
  return vld1_dup_s64(a);
}

// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_dup_f16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load half, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x half> poison, half [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x half> [[LANE]]
float16x4_t test_vld1_dup_f16(float16_t *a) {
  return vld1_dup_f16(a);
}

// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_dup_f32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load float, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x float> [[LANE]]
float32x2_t test_vld1_dup_f32(float32_t *a) {
  return vld1_dup_f32(a);
}

// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_dup_f64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load double, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x double> [[LANE]]
float64x1_t test_vld1_dup_f64(float64_t *a) {
  return vld1_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_p8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i8> [[LANE]]
poly8x8_t test_vld1_dup_p8(poly8_t *a) {
  return vld1_dup_p8(a);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_p16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i16> [[LANE]]
poly16x4_t test_vld1_dup_p16(poly16_t *a) {
  return vld1_dup_p16(a);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_p64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
poly64x1_t test_vld1_dup_p64(poly64_t *a) {
  return vld1_dup_p64(a);
}
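
// The vld2/vld3/vld4 dup intrinsics return NEON multi-vector structs. The
// CHECK lines expect a call to the corresponding llvm.aarch64.neon.ldNr
// intrinsic, a store of the returned aggregate into a temporary, and a
// memcpy into the return-value temporary that is then loaded and returned.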
// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_dup_u64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) {
  return vld2q_dup_u64(a);
}

// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_dup_s64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld2q_dup_s64(int64_t *a) {
  return vld2q_dup_s64(a);
}

// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0(ptr %a)
// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld2q_dup_f64(float64_t *a) {
  return vld2q_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) {
  return vld2q_dup_p64(a);
}

// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0(ptr %a)
// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld2_dup_f64(float64_t *a) {
  return vld2_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr %a)
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld2_dup_p64(poly64_t *a) {
  return vld2_dup_p64(a);
}

// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_dup_u64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) {
  return vld3q_dup_u64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_dup_s64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x3_t [[TMP6]]
int64x2x3_t test_vld3q_dup_s64(int64_t *a) {
  return vld3q_dup_s64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0(ptr %a)
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP6]]
float64x2x3_t test_vld3q_dup_f64(float64_t *a) {
  return vld3q_dup_f64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) {
  return vld3q_dup_p64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0(ptr %a)
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld3_dup_f64(float64_t *a) {
  return vld3_dup_f64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr %a)
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld3_dup_p64(poly64_t *a) {
  return vld3_dup_p64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_dup_u64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) {
  return vld4q_dup_u64(a);
}

// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_dup_s64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x4_t [[TMP6]]
int64x2x4_t test_vld4q_dup_s64(int64_t *a) {
  return vld4q_dup_s64(a);
}

// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0(ptr %a)
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld4q_dup_f64(float64_t *a) {
  return vld4q_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) {
  return vld4q_dup_p64(a);
}

// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0(ptr %a)
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP6]]
float64x1x4_t test_vld4_dup_f64(float64_t *a) {
  return vld4_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %a)
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld4_dup_p64(poly64_t *a) {
  return vld4_dup_p64(a);
}
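
// vld1q_lane tests: the intrinsic loads a single scalar and inserts it into
// the requested lane of the vector passed in %b, so the IR reduces to a load
// plus an insertelement (with bitcasts through <16 x i8> for element types
// other than i8).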
// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK: ret <16 x i8> [[VLD1_LANE]]
uint8x16_t test_vld1q_lane_u8(uint8_t *a, uint8x16_t b) {
  return vld1q_lane_u8(a, b, 15);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK: ret <8 x i16> [[VLD1_LANE]]
uint16x8_t test_vld1q_lane_u16(uint16_t *a, uint16x8_t b) {
  return vld1q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK: ret <4 x i32> [[VLD1_LANE]]
uint32x4_t test_vld1q_lane_u32(uint32_t *a, uint32x4_t b) {
  return vld1q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
// CHECK: ret <2 x i64> [[VLD1_LANE]]
uint64x2_t test_vld1q_lane_u64(uint64_t *a, uint64x2_t b) {
  return vld1q_lane_u64(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK: ret <16 x i8> [[VLD1_LANE]]
int8x16_t test_vld1q_lane_s8(int8_t *a, int8x16_t b) {
  return vld1q_lane_s8(a, b, 15);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK: ret <8 x i16> [[VLD1_LANE]]
int16x8_t test_vld1q_lane_s16(int16_t *a, int16x8_t b) {
  return vld1q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK: ret <4 x i32> [[VLD1_LANE]]
int32x4_t test_vld1q_lane_s32(int32_t *a, int32x4_t b) {
  return vld1q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
// CHECK: ret <2 x i64> [[VLD1_LANE]]
int64x2_t test_vld1q_lane_s64(int64_t *a, int64x2_t b) {
  return vld1q_lane_s64(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
// CHECK: [[TMP4:%.*]] = load half, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
// CHECK: ret <8 x half> [[VLD1_LANE]]
float16x8_t test_vld1q_lane_f16(float16_t *a, float16x8_t b) {
  return vld1q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP4:%.*]] = load float, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
// CHECK: ret <4 x float> [[VLD1_LANE]]
float32x4_t test_vld1q_lane_f32(float32_t *a, float32x4_t b) {
  return vld1q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[TMP4:%.*]] = load double, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
// CHECK: ret <2 x double> [[VLD1_LANE]]
float64x2_t test_vld1q_lane_f64(float64_t *a, float64x2_t b) {
  return vld1q_lane_f64(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK: ret <16 x i8> [[VLD1_LANE]]
poly8x16_t test_vld1q_lane_p8(poly8_t *a, poly8x16_t b) {
  return vld1q_lane_p8(a, b, 15);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK: ret <8 x i16> [[VLD1_LANE]]
poly16x8_t test_vld1q_lane_p16(poly16_t *a, poly16x8_t b) {
  return vld1q_lane_p16(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
// CHECK: ret <2 x i64> [[VLD1_LANE]]
poly64x2_t test_vld1q_lane_p64(poly64_t *a, poly64x2_t b) {
  return vld1q_lane_p64(a, b, 1);
}
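
// The 64-bit vld1_lane variants are checked the same way, with <8 x i8> as
// the intermediate bitcast type.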
// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK: ret <8 x i8> [[VLD1_LANE]]
uint8x8_t test_vld1_lane_u8(uint8_t *a, uint8x8_t b) {
  return vld1_lane_u8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK: ret <4 x i16> [[VLD1_LANE]]
uint16x4_t test_vld1_lane_u16(uint16_t *a, uint16x4_t b) {
  return vld1_lane_u16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK: ret <2 x i32> [[VLD1_LANE]]
uint32x2_t test_vld1_lane_u32(uint32_t *a, uint32x2_t b) {
  return vld1_lane_u32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK: ret <1 x i64> [[VLD1_LANE]]
uint64x1_t test_vld1_lane_u64(uint64_t *a, uint64x1_t b) {
  return vld1_lane_u64(a, b, 0);
}

// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK: ret <8 x i8> [[VLD1_LANE]]
int8x8_t test_vld1_lane_s8(int8_t *a, int8x8_t b) {
  return vld1_lane_s8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK: ret <4 x i16> [[VLD1_LANE]]
int16x4_t test_vld1_lane_s16(int16_t *a, int16x4_t b) {
  return vld1_lane_s16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK: ret <2 x i32> [[VLD1_LANE]]
int32x2_t test_vld1_lane_s32(int32_t *a, int32x2_t b) {
  return vld1_lane_s32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK: ret <1 x i64> [[VLD1_LANE]]
int64x1_t test_vld1_lane_s64(int64_t *a, int64x1_t b) {
  return vld1_lane_s64(a, b, 0);
}

// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
// CHECK: [[TMP4:%.*]] = load half, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
// CHECK: ret <4 x half> [[VLD1_LANE]]
float16x4_t test_vld1_lane_f16(float16_t *a, float16x4_t b) {
  return vld1_lane_f16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP4:%.*]] = load float, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
// CHECK: ret <2 x float> [[VLD1_LANE]]
float32x2_t test_vld1_lane_f32(float32_t *a, float32x2_t b) {
  return vld1_lane_f32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK: [[TMP4:%.*]] = load double, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
// CHECK: ret <1 x double> [[VLD1_LANE]]
float64x1_t test_vld1_lane_f64(float64_t *a, float64x1_t b) {
  return vld1_lane_f64(a, b, 0);
}

// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK: ret <8 x i8> [[VLD1_LANE]]
poly8x8_t test_vld1_lane_p8(poly8_t *a, poly8x8_t b) {
  return vld1_lane_p8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK: ret <4 x i16> [[VLD1_LANE]]
poly16x4_t test_vld1_lane_p16(poly16_t *a, poly16x4_t b) {
  return vld1_lane_p16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK: ret <1 x i64> [[VLD1_LANE]]
poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) {
  return vld1_lane_p64(a, b, 0);
}
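
// The vldNq_lane intrinsics on vector tuples lower to the
// llvm.aarch64.neon.ldNlane intrinsics: the incoming tuple is spilled to a
// temporary, each member vector is reloaded and passed to the intrinsic
// along with the lane index and the pointer, and the aggregate result is
// copied back out through the return temporary.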
// CHECK-LABEL: define{{.*}} %struct.int8x16x2_t @test_vld2q_lane_s8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x2_t [[TMP8]]
int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
  return vld2q_lane_s8(ptr, src, 15);
}

// CHECK-LABEL: define{{.*}} %struct.uint8x16x2_t @test_vld2q_lane_u8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
  return vld2q_lane_u8(ptr, src, 15);
}

// CHECK-LABEL: define{{.*}} %struct.poly8x16x2_t @test_vld2q_lane_p8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
  return vld2q_lane_p8(ptr, src, 15);
}

// CHECK-LABEL: define{{.*}} %struct.int8x16x3_t @test_vld3q_lane_s8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x3_t [[TMP9]]
int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
  return vld3q_lane_s8(ptr, src, 15);
}

// CHECK-LABEL: define{{.*}} %struct.uint8x16x3_t @test_vld3q_lane_u8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x3_t [[TMP9]]
uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
  return vld3q_lane_u8(ptr, src, 15);
}
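
// For lane loads with element types wider than i8, the member vectors are
// additionally round-tripped through <16 x i8> bitcasts before and after the
// ld2lane call, as the CHECK lines below expect.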
872 // CHECK-LABEL: define{{.*}} %struct.uint16x8x2_t @test_vld2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
873 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
874 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
875 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
876 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
877 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
878 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
879 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
880 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
881 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
882 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
883 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
884 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
885 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
886 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
887 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
888 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
889 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
890 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
891 // CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
892 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
893 // CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
894 // CHECK: ret %struct.uint16x8x2_t [[TMP13]]
895 uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
896 return vld2q_lane_u16(a, b, 7);
899 // CHECK-LABEL: define{{.*}} %struct.uint32x4x2_t @test_vld2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
900 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
901 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
902 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
903 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
904 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
905 // CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
906 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
907 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
908 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
909 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
910 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
911 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
912 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
913 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
914 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
915 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
916 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
917 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
918 // CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
919 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
920 // CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
921 // CHECK: ret %struct.uint32x4x2_t [[TMP13]]
922 uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
923 return vld2q_lane_u32(a, b, 3);
926 // CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
927 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
928 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
929 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
930 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
931 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
932 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
933 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
934 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
935 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
936 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
937 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
938 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
939 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
940 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
941 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
942 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
943 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
944 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
945 // CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
946 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
947 // CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
948 // CHECK: ret %struct.uint64x2x2_t [[TMP13]]
949 uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
950 return vld2q_lane_u64(a, b, 1);
953 // CHECK-LABEL: define{{.*}} %struct.int16x8x2_t @test_vld2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
954 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
955 // CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
956 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
957 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
958 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
959 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
960 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
961 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
962 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
963 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
964 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
965 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
966 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
967 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
968 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
969 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
970 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
971 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
972 // CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
973 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
974 // CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
975 // CHECK: ret %struct.int16x8x2_t [[TMP13]]
976 int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) {
977 return vld2q_lane_s16(a, b, 7);
978 }
980 // CHECK-LABEL: define{{.*}} %struct.int32x4x2_t @test_vld2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
981 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
982 // CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
983 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
984 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
985 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
986 // CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
987 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
988 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
989 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
990 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
991 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
992 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
993 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
994 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
995 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
996 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
997 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
998 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
999 // CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
1000 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1001 // CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
1002 // CHECK: ret %struct.int32x4x2_t [[TMP13]]
1003 int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) {
1004 return vld2q_lane_s32(a, b, 3);
1005 }
1007 // CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1008 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
1009 // CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
1010 // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
1011 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
1012 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
1013 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1014 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1015 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
1016 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1017 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1018 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1019 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
1020 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1021 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1022 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1023 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1024 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1025 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
1026 // CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1027 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1028 // CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
1029 // CHECK: ret %struct.int64x2x2_t [[TMP13]]
1030 int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) {
1031 return vld2q_lane_s64(a, b, 1);
1032 }
1034 // CHECK-LABEL: define{{.*}} %struct.float16x8x2_t @test_vld2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
1035 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
1036 // CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
1037 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
1038 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
1039 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
1040 // CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1041 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1042 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
1043 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
1044 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
1045 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
1046 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
1047 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
1048 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
1049 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
1050 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
1051 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
1052 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, ptr %a)
1053 // CHECK: store { <8 x half>, <8 x half> } [[VLD2_LANE]], ptr [[__RET]]
1054 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1055 // CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, ptr [[RETVAL]], align 16
1056 // CHECK: ret %struct.float16x8x2_t [[TMP13]]
1057 float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) {
1058 return vld2q_lane_f16(a, b, 7);
1059 }
1061 // CHECK-LABEL: define{{.*}} %struct.float32x4x2_t @test_vld2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 {
1062 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
1063 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
1064 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
1065 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
1066 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
1067 // CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1068 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1069 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
1070 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
1071 // CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
1072 // CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
1073 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
1074 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
1075 // CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
1076 // CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
1077 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
1078 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
1079 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, ptr %a)
1080 // CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], ptr [[__RET]]
1081 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1082 // CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
1083 // CHECK: ret %struct.float32x4x2_t [[TMP13]]
1084 float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) {
1085 return vld2q_lane_f32(a, b, 3);
1086 }
1088 // CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 {
1089 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
1090 // CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
1091 // CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
1092 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
1093 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
1094 // CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1095 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1096 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
1097 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
1098 // CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
1099 // CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
1100 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
1101 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
1102 // CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
1103 // CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
1104 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
1105 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
1106 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, ptr %a)
1107 // CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], ptr [[__RET]]
1108 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1109 // CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
1110 // CHECK: ret %struct.float64x2x2_t [[TMP13]]
1111 float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) {
1112 return vld2q_lane_f64(a, b, 1);
1113 }
1115 // CHECK-LABEL: define{{.*}} %struct.poly16x8x2_t @test_vld2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1116 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
1117 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
1118 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
1119 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
1120 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
1121 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1122 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1123 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
1124 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1125 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1126 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1127 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
1128 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1129 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1130 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1131 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1132 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1133 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
1134 // CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1135 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1136 // CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
1137 // CHECK: ret %struct.poly16x8x2_t [[TMP13]]
1138 poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
1139 return vld2q_lane_p16(a, b, 7);
1140 }
1142 // CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1143 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
1144 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
1145 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
1146 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
1147 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
1148 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1149 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1150 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
1151 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1152 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1153 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1154 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
1155 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1156 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1157 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1158 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1159 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1160 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
1161 // CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1162 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1163 // CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
1164 // CHECK: ret %struct.poly64x2x2_t [[TMP13]]
1165 poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
1166 return vld2q_lane_p64(a, b, 1);
1167 }
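// The vld2_lane tests below exercise the 64-bit (D-register) variants: the aggregates are
// 8-byte aligned, the struct copies are 16 bytes, and the calls lower to the v8i8/v4i16/
// v2i32/v1i64 (and matching half/float/double) forms of llvm.aarch64.neon.ld2lane.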
1169 // CHECK-LABEL: define{{.*}} %struct.uint8x8x2_t @test_vld2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1170 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
1171 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
1172 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
1173 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
1174 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
1175 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1176 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1177 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
1178 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1179 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1180 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
1181 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1182 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1183 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
1184 // CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
1185 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1186 // CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
1187 // CHECK: ret %struct.uint8x8x2_t [[TMP8]]
1188 uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) {
1189 return vld2_lane_u8(a, b, 7);
1190 }
1192 // CHECK-LABEL: define{{.*}} %struct.uint16x4x2_t @test_vld2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1193 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
1194 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
1195 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
1196 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
1197 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
1198 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1199 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1200 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
1201 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1202 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1203 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1204 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
1205 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1206 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1207 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1208 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1209 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1210 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
1211 // CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1212 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1213 // CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
1214 // CHECK: ret %struct.uint16x4x2_t [[TMP13]]
1215 uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) {
1216 return vld2_lane_u16(a, b, 3);
1217 }
1219 // CHECK-LABEL: define{{.*}} %struct.uint32x2x2_t @test_vld2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
1220 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
1221 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
1222 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
1223 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
1224 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
1225 // CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1226 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1227 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
1228 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
1229 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
1230 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1231 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
1232 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
1233 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
1234 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1235 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1236 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1237 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
1238 // CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
1239 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1240 // CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
1241 // CHECK: ret %struct.uint32x2x2_t [[TMP13]]
1242 uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) {
1243 return vld2_lane_u32(a, b, 1);
1244 }
1246 // CHECK-LABEL: define{{.*}} %struct.uint64x1x2_t @test_vld2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
1247 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
1248 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
1249 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
1250 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
1251 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
1252 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1253 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1254 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
1255 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
1256 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
1257 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1258 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
1259 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
1260 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
1261 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1262 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1263 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1264 // CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
1265 // CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1266 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1267 // CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, ptr [[RETVAL]], align 8
1268 // CHECK: ret %struct.uint64x1x2_t [[TMP13]]
1269 uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
1270 return vld2_lane_u64(a, b, 0);
1271 }
1273 // CHECK-LABEL: define{{.*}} %struct.int8x8x2_t @test_vld2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1274 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
1275 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
1276 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
1277 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
1278 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
1279 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1280 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1281 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
1282 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1283 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1284 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
1285 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1286 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1287 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
1288 // CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
1289 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1290 // CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
1291 // CHECK: ret %struct.int8x8x2_t [[TMP8]]
1292 int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
1293 return vld2_lane_s8(a, b, 7);
1294 }
1296 // CHECK-LABEL: define{{.*}} %struct.int16x4x2_t @test_vld2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1297 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
1298 // CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
1299 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
1300 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
1301 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
1302 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1303 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1304 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
1305 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1306 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1307 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1308 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
1309 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1310 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1311 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1312 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1313 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1314 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
1315 // CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1316 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1317 // CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
1318 // CHECK: ret %struct.int16x4x2_t [[TMP13]]
1319 int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) {
1320 return vld2_lane_s16(a, b, 3);
1321 }
1323 // CHECK-LABEL: define{{.*}} %struct.int32x2x2_t @test_vld2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
1324 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
1325 // CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
1326 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
1327 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
1328 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
1329 // CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1330 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1331 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
1332 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
1333 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
1334 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1335 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
1336 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
1337 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
1338 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1339 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1340 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1341 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
1342 // CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
1343 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1344 // CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
1345 // CHECK: ret %struct.int32x2x2_t [[TMP13]]
1346 int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) {
1347 return vld2_lane_s32(a, b, 1);
1348 }
1350 // CHECK-LABEL: define{{.*}} %struct.int64x1x2_t @test_vld2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
1351 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
1352 // CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
1353 // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
1354 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
1355 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
1356 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1357 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1358 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
1359 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
1360 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
1361 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1362 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
1363 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
1364 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
1365 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1366 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1367 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1368 // CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
1369 // CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1370 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1371 // CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, ptr [[RETVAL]], align 8
1372 // CHECK: ret %struct.int64x1x2_t [[TMP13]]
1373 int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) {
1374 return vld2_lane_s64(a, b, 0);
1375 }
1377 // CHECK-LABEL: define{{.*}} %struct.float16x4x2_t @test_vld2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 {
1378 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
1379 // CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
1380 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
1381 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
1382 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
1383 // CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1384 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1385 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
1386 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
1387 // CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
1388 // CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
1389 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
1390 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
1391 // CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
1392 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
1393 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
1394 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
1395 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, ptr %a)
1396 // CHECK: store { <4 x half>, <4 x half> } [[VLD2_LANE]], ptr [[__RET]]
1397 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1398 // CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, ptr [[RETVAL]], align 8
1399 // CHECK: ret %struct.float16x4x2_t [[TMP13]]
1400 float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) {
1401 return vld2_lane_f16(a, b, 3);
1402 }
1404 // CHECK-LABEL: define{{.*}} %struct.float32x2x2_t @test_vld2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 {
1405 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
1406 // CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
1407 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
1408 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
1409 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
1410 // CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1411 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1412 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
1413 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
1414 // CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
1415 // CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
1416 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
1417 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
1418 // CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
1419 // CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
1420 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
1421 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
1422 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, ptr %a)
1423 // CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], ptr [[__RET]]
1424 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1425 // CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
1426 // CHECK: ret %struct.float32x2x2_t [[TMP13]]
1427 float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
1428 return vld2_lane_f32(a, b, 1);
1429 }
1431 // CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 {
1432 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
1433 // CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
1434 // CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
1435 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
1436 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
1437 // CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1438 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1439 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
1440 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
1441 // CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
1442 // CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
1443 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
1444 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
1445 // CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
1446 // CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
1447 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
1448 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
1449 // CHECK: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, ptr %a)
1450 // CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], ptr [[__RET]]
1451 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1452 // CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
1453 // CHECK: ret %struct.float64x1x2_t [[TMP13]]
1454 float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) {
1455 return vld2_lane_f64(a, b, 0);
1456 }
1458 // CHECK-LABEL: define{{.*}} %struct.poly8x8x2_t @test_vld2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1459 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
1460 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
1461 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
1462 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
1463 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
1464 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1465 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1466 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
1467 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1468 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1469 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
1470 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1471 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1472 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
1473 // CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
1474 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1475 // CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
1476 // CHECK: ret %struct.poly8x8x2_t [[TMP8]]
1477 poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) {
1478 return vld2_lane_p8(a, b, 7);
1479 }
1481 // CHECK-LABEL: define{{.*}} %struct.poly16x4x2_t @test_vld2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1482 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
1483 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
1484 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
1485 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
1486 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
1487 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1488 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1489 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
1490 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1491 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1492 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1493 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
1494 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1495 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1496 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1497 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1498 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1499 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
1500 // CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1501 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1502 // CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
1503 // CHECK: ret %struct.poly16x4x2_t [[TMP13]]
1504 poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) {
1505 return vld2_lane_p16(a, b, 3);
1506 }
1508 // CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
1509 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
1510 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
1511 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
1512 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
1513 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
1514 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1515 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1516 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
1517 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
1518 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
1519 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1520 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
1521 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
1522 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
1523 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1524 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1525 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1526 // CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
1527 // CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1528 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1529 // CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
1530 // CHECK: ret %struct.poly64x1x2_t [[TMP13]]
1531 poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) {
1532 return vld2_lane_p64(a, b, 0);
1533 }
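// From here on the tests switch to vld3q_lane: three-element aggregates (48-byte struct
// copies) lowered to llvm.aarch64.neon.ld3lane, which takes three vector operands followed
// by the lane index and the source pointer.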
1535 // CHECK-LABEL: define{{.*}} %struct.uint16x8x3_t @test_vld3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1536 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
1537 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
1538 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
1539 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
1540 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
1541 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1542 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1543 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
1544 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1545 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1546 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1547 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
1548 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1549 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1550 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1551 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
1552 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
1553 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
1554 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1555 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1556 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1557 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1558 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
1559 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1560 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1561 // CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, ptr [[RETVAL]], align 16
1562 // CHECK: ret %struct.uint16x8x3_t [[TMP16]]
1563 uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
1564 return vld3q_lane_u16(a, b, 7);
1565 }
1567 // CHECK-LABEL: define{{.*}} %struct.uint32x4x3_t @test_vld3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
1568 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
1569 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
1570 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
1571 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
1572 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
1573 // CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1574 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1575 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
1576 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
1577 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
1578 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
1579 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
1580 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
1581 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
1582 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
1583 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
1584 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
1585 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
1586 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
1587 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1588 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1589 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
1590 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
1591 // CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
1592 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1593 // CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, ptr [[RETVAL]], align 16
1594 // CHECK: ret %struct.uint32x4x3_t [[TMP16]]
1595 uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
1596 return vld3q_lane_u32(a, b, 3);
1597 }
1599 // CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1600 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
1601 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
1602 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
1603 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
1604 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
1605 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1606 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1607 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
1608 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1609 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1610 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1611 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
1612 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1613 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1614 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1615 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
1616 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
1617 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
1618 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
1619 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1620 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1621 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
1622 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
1623 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
1624 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1625 // CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
1626 // CHECK: ret %struct.uint64x2x3_t [[TMP16]]
1627 uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
1628 return vld3q_lane_u64(a, b, 1);
1629 }
1631 // CHECK-LABEL: define{{.*}} %struct.int16x8x3_t @test_vld3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1632 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
1633 // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
1634 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
1635 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
1636 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
1637 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1638 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1639 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
1640 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1641 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1642 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1643 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
1644 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1645 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1646 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1647 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
1648 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
1649 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
1650 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1651 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1652 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1653 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1654 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
1655 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1656 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1657 // CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, ptr [[RETVAL]], align 16
1658 // CHECK: ret %struct.int16x8x3_t [[TMP16]]
1659 int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) {
1660 return vld3q_lane_s16(a, b, 7);
1661 }
1663 // CHECK-LABEL: define{{.*}} %struct.int32x4x3_t @test_vld3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
1664 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
1665 // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
1666 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
1667 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
1668 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
1669 // CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1670 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1671 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
1672 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
1673 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
1674 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
1675 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
1676 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
1677 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
1678 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
1679 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
1680 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
1681 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
1682 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
1683 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1684 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1685 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
1686 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
1687 // CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
1688 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1689 // CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, ptr [[RETVAL]], align 16
1690 // CHECK: ret %struct.int32x4x3_t [[TMP16]]
1691 int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) {
1692 return vld3q_lane_s32(a, b, 3);
1693 }
1695 // CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1696 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
1697 // CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
1698 // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
1699 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
1700 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
1701 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1702 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1703 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
1704 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1705 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1706 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1707 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
1708 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1709 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1710 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1711 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
1712 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
1713 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
1714 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
1715 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1716 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1717 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
1718 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
1719 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
1720 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1721 // CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
1722 // CHECK: ret %struct.int64x2x3_t [[TMP16]]
1723 int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) {
1724 return vld3q_lane_s64(a, b, 1);
1725 }
1727 // CHECK-LABEL: define{{.*}} %struct.float16x8x3_t @test_vld3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
1728 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
1729 // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
1730 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
1731 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
1732 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
1733 // CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1734 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1735 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
1736 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
1737 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
1738 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
1739 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
1740 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
1741 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
1742 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
1743 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
1744 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
1745 // CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
1746 // CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
1747 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
1748 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
1749 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
1750 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, ptr %a)
1751 // CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], ptr [[__RET]]
1752 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1753 // CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, ptr [[RETVAL]], align 16
1754 // CHECK: ret %struct.float16x8x3_t [[TMP16]]
1755 float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) {
1756 return vld3q_lane_f16(a, b, 7);
1757 }
1759 // CHECK-LABEL: define{{.*}} %struct.float32x4x3_t @test_vld3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
1760 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
1761 // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
1762 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
1763 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
1764 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
1765 // CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1766 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1767 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
1768 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
1769 // CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
1770 // CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
1771 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
1772 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
1773 // CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
1774 // CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
1775 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
1776 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
1777 // CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
1778 // CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
1779 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
1780 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
1781 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
1782 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, ptr %a)
1783 // CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], ptr [[__RET]]
1784 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1785 // CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, ptr [[RETVAL]], align 16
1786 // CHECK: ret %struct.float32x4x3_t [[TMP16]]
1787 float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) {
1788 return vld3q_lane_f32(a, b, 3);
1789 }
1791 // CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
1792 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
1793 // CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
1794 // CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
1795 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
1796 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
1797 // CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1798 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1799 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
1800 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
1801 // CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
1802 // CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
1803 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
1804 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
1805 // CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
1806 // CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
1807 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
1808 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
1809 // CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
1810 // CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
1811 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
1812 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
1813 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
1814 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, ptr %a)
1815 // CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], ptr [[__RET]]
1816 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1817 // CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
1818 // CHECK: ret %struct.float64x2x3_t [[TMP16]]
1819 float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
1820 return vld3q_lane_f64(a, b, 1);
1821 }
1823 // CHECK-LABEL: define{{.*}} %struct.poly8x16x3_t @test_vld3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
1824 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
1825 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
1826 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
1827 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
1828 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
1829 // CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1830 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1831 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
1832 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
1833 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
1834 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
1835 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
1836 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
1837 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
1838 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
1839 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
1840 // CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
1841 // CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
1842 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1843 // CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, ptr [[RETVAL]], align 16
1844 // CHECK: ret %struct.poly8x16x3_t [[TMP9]]
1845 poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
1846 return vld3q_lane_p8(a, b, 15);
1847 }
1849 // CHECK-LABEL: define{{.*}} %struct.poly16x8x3_t @test_vld3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1850 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
1851 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
1852 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
1853 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
1854 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
1855 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1856 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1857 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
1858 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1859 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1860 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1861 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
1862 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1863 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1864 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1865 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
1866 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
1867 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
1868 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1869 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1870 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1871 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1872 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
1873 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1874 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1875 // CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, ptr [[RETVAL]], align 16
1876 // CHECK: ret %struct.poly16x8x3_t [[TMP16]]
1877 poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
1878 return vld3q_lane_p16(a, b, 7);
1879 }
1881 // CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1882 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
1883 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
1884 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
1885 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
1886 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
1887 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1888 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1889 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
1890 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1891 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1892 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1893 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
1894 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1895 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1896 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1897 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
1898 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
1899 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
1900 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
1901 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1902 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1903 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
1904 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
1905 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
1906 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1907 // CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
1908 // CHECK: ret %struct.poly64x2x3_t [[TMP16]]
1909 poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
1910 return vld3q_lane_p64(a, b, 1);
1911 }
1913 // CHECK-LABEL: define{{.*}} %struct.uint8x8x3_t @test_vld3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1914 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
1915 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
1916 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
1917 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
1918 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
1919 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1920 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
1921 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
1922 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1923 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1924 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
1925 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1926 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1927 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
1928 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
1929 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
1930 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
1931 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
1932 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
1933 // CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, ptr [[RETVAL]], align 8
1934 // CHECK: ret %struct.uint8x8x3_t [[TMP9]]
1935 uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
1936 return vld3_lane_u8(a, b, 7);
1937 }
1939 // CHECK-LABEL: define{{.*}} %struct.uint16x4x3_t @test_vld3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1940 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
1941 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
1942 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
1943 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
1944 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
1945 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1946 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
1947 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
1948 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1949 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1950 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1951 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
1952 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1953 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1954 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1955 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
1956 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
1957 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
1958 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
1959 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1960 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1961 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
1962 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
1963 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1964 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
1965 // CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, ptr [[RETVAL]], align 8
1966 // CHECK: ret %struct.uint16x4x3_t [[TMP16]]
1967 uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
1968 return vld3_lane_u16(a, b, 3);
1969 }
1971 // CHECK-LABEL: define{{.*}} %struct.uint32x2x3_t @test_vld3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
1972 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
1973 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
1974 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
1975 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
1976 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
1977 // CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1978 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
1979 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
1980 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
1981 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
1982 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1983 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
1984 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
1985 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
1986 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1987 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
1988 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
1989 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
1990 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
1991 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1992 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1993 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
1994 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
1995 // CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
1996 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
1997 // CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, ptr [[RETVAL]], align 8
1998 // CHECK: ret %struct.uint32x2x3_t [[TMP16]]
1999 uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
2000 return vld3_lane_u32(a, b, 1);
2001 }
2003 // CHECK-LABEL: define{{.*}} %struct.uint64x1x3_t @test_vld3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2004 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
2005 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
2006 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
2007 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
2008 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
2009 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2010 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2011 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
2012 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2013 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2014 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2015 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
2016 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2017 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2018 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2019 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
2020 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2021 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2022 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2023 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2024 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2025 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2026 // CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
2027 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
2028 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2029 // CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, ptr [[RETVAL]], align 8
2030 // CHECK: ret %struct.uint64x1x3_t [[TMP16]]
2031 uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) {
2032 return vld3_lane_u64(a, b, 0);
2033 }
2035 // CHECK-LABEL: define{{.*}} %struct.int8x8x3_t @test_vld3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2036 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
2037 // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
2038 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
2039 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
2040 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
2041 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2042 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2043 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
2044 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2045 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2046 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
2047 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2048 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2049 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
2050 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2051 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2052 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
2053 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
2054 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2055 // CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, ptr [[RETVAL]], align 8
2056 // CHECK: ret %struct.int8x8x3_t [[TMP9]]
2057 int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) {
2058 return vld3_lane_s8(a, b, 7);
2059 }
2061 // CHECK-LABEL: define{{.*}} %struct.int16x4x3_t @test_vld3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
2062 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
2063 // CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
2064 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
2065 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
2066 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
2067 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2068 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2069 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
2070 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
2071 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
2072 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2073 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
2074 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
2075 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
2076 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2077 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
2078 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
2079 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
2080 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2081 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2082 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2083 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2084 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
2085 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
2086 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2087 // CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, ptr [[RETVAL]], align 8
2088 // CHECK: ret %struct.int16x4x3_t [[TMP16]]
2089 int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) {
2090 return vld3_lane_s16(a, b, 3);
2091 }
2093 // CHECK-LABEL: define{{.*}} %struct.int32x2x3_t @test_vld3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
2094 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
2095 // CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
2096 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
2097 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
2098 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
2099 // CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2100 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2101 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
2102 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
2103 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
2104 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
2105 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
2106 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
2107 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
2108 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
2109 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
2110 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
2111 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
2112 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
2113 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
2114 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
2115 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
2116 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
2117 // CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
2118 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2119 // CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, ptr [[RETVAL]], align 8
2120 // CHECK: ret %struct.int32x2x3_t [[TMP16]]
2121 int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) {
2122 return vld3_lane_s32(a, b, 1);
2123 }
2125 // CHECK-LABEL: define{{.*}} %struct.int64x1x3_t @test_vld3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2126 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
2127 // CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
2128 // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
2129 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
2130 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
2131 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2132 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2133 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
2134 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2135 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2136 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2137 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
2138 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2139 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2140 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2141 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
2142 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2143 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2144 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2145 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2146 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2147 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2148 // CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
2149 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
2150 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2151 // CHECK: [[TMP16:%.*]] = load %struct.int64x1x3_t, ptr [[RETVAL]], align 8
2152 // CHECK: ret %struct.int64x1x3_t [[TMP16]]
2153 int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) {
2154 return vld3_lane_s64(a, b, 0);
2155 }
2157 // CHECK-LABEL: define{{.*}} %struct.float16x4x3_t @test_vld3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
2158 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
2159 // CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
2160 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
2161 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
2162 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
2163 // CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2164 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2165 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
2166 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
2167 // CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
2168 // CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
2169 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
2170 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
2171 // CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
2172 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
2173 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
2174 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
2175 // CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
2176 // CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
2177 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
2178 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
2179 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
2180 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, ptr %a)
2181 // CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], ptr [[__RET]]
2182 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2183 // CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, ptr [[RETVAL]], align 8
2184 // CHECK: ret %struct.float16x4x3_t [[TMP16]]
2185 float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) {
2186 return vld3_lane_f16(a, b, 3);
2187 }
2189 // CHECK-LABEL: define{{.*}} %struct.float32x2x3_t @test_vld3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
2190 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
2191 // CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
2192 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
2193 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
2194 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
2195 // CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2196 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2197 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
2198 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
2199 // CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
2200 // CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
2201 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
2202 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
2203 // CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
2204 // CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
2205 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
2206 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
2207 // CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
2208 // CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
2209 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
2210 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
2211 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
2212 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, ptr %a)
2213 // CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], ptr [[__RET]]
2214 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2215 // CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, ptr [[RETVAL]], align 8
2216 // CHECK: ret %struct.float32x2x3_t [[TMP16]]
2217 float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) {
2218 return vld3_lane_f32(a, b, 1);
2219 }
2221 // CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
2222 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
2223 // CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
2224 // CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
2225 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
2226 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
2227 // CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2228 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2229 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
2230 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
2231 // CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
2232 // CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
2233 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
2234 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
2235 // CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
2236 // CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
2237 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
2238 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
2239 // CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
2240 // CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
2241 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
2242 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
2243 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
2244 // CHECK: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, ptr %a)
2245 // CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], ptr [[__RET]]
2246 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2247 // CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
2248 // CHECK: ret %struct.float64x1x3_t [[TMP16]]
2249 float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) {
2250 return vld3_lane_f64(a, b, 0);
2251 }
2253 // CHECK-LABEL: define{{.*}} %struct.poly8x8x3_t @test_vld3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2254 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
2255 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
2256 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
2257 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
2258 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
2259 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2260 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2261 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
2262 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2263 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2264 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
2265 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2266 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2267 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
2268 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2269 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2270 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
2271 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
2272 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2273 // CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, ptr [[RETVAL]], align 8
2274 // CHECK: ret %struct.poly8x8x3_t [[TMP9]]
2275 poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) {
2276 return vld3_lane_p8(a, b, 7);
2277 }
2279 // CHECK-LABEL: define{{.*}} %struct.poly16x4x3_t @test_vld3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
2280 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
2281 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
2282 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
2283 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
2284 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
2285 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2286 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2287 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
2288 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
2289 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
2290 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2291 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
2292 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
2293 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
2294 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2295 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
2296 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
2297 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
2298 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2299 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2300 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2301 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2302 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
2303 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
2304 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2305 // CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, ptr [[RETVAL]], align 8
2306 // CHECK: ret %struct.poly16x4x3_t [[TMP16]]
2307 poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) {
2308 return vld3_lane_p16(a, b, 3);
2309 }
2311 // CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2312 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
2313 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
2314 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
2315 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
2316 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
2317 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2318 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2319 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
2320 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2321 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2322 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2323 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
2324 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2325 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2326 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2327 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
2328 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2329 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2330 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2331 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2332 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2333 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2334 // CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
2335 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
2336 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2337 // CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
2338 // CHECK: ret %struct.poly64x1x3_t [[TMP16]]
2339 poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) {
2340 return vld3_lane_p64(a, b, 0);
2341 }
2343 // CHECK-LABEL: define{{.*}} %struct.uint8x16x4_t @test_vld4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
2344 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
2345 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
2346 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
2347 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
2348 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
2349 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2350 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2351 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2352 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
2353 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
2354 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2355 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
2356 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
2357 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2358 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
2359 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
2360 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2361 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
2362 // CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
2363 // CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
2364 // CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2365 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2366 // CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, ptr [[RETVAL]], align 16
2367 // CHECK: ret %struct.uint8x16x4_t [[TMP10]]
2368 uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
2369 return vld4q_lane_u8(a, b, 15);
2370 }
2372 // CHECK-LABEL: define{{.*}} %struct.uint16x8x4_t @test_vld4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
2373 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
2374 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
2375 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
2376 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
2377 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
2378 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2379 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2380 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2381 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
2382 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
2383 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2384 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2385 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
2386 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
2387 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2388 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2389 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
2390 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
2391 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2392 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2393 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
2394 // CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
2395 // CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2396 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2397 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2398 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2399 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2400 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
2401 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2402 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2403 // CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, ptr [[RETVAL]], align 16
2404 // CHECK: ret %struct.uint16x8x4_t [[TMP19]]
2405 uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
2406 return vld4q_lane_u16(a, b, 7);
2407 }
2409 // CHECK-LABEL: define{{.*}} %struct.uint32x4x4_t @test_vld4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
2410 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
2411 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
2412 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
2413 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
2414 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
2415 // CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2416 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2417 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2418 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
2419 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
2420 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
2421 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2422 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
2423 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
2424 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
2425 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2426 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
2427 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
2428 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
2429 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2430 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
2431 // CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
2432 // CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2433 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2434 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2435 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
2436 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
2437 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
2438 // CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
2439 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2440 // CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, ptr [[RETVAL]], align 16
2441 // CHECK: ret %struct.uint32x4x4_t [[TMP19]]
2442 uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
2443 return vld4q_lane_u32(a, b, 3);
2444 }
2446 // CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
2447 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
2448 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
2449 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
2450 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
2451 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
2452 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2453 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2454 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2455 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
2456 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
2457 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2458 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2459 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
2460 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
2461 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2462 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2463 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
2464 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
2465 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2466 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2467 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
2468 // CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
2469 // CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
2470 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2471 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2472 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2473 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2474 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
2475 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2476 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2477 // CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
2478 // CHECK: ret %struct.uint64x2x4_t [[TMP19]]
2479 uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
2480 return vld4q_lane_u64(a, b, 1);
2481 }
2483 // CHECK-LABEL: define{{.*}} %struct.int8x16x4_t @test_vld4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
2484 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
2485 // CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
2486 // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
2487 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
2488 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
2489 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2490 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2491 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2492 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
2493 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
2494 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2495 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
2496 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
2497 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2498 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
2499 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
2500 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2501 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
2502 // CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
2503 // CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
2504 // CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2505 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2506 // CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, ptr [[RETVAL]], align 16
2507 // CHECK: ret %struct.int8x16x4_t [[TMP10]]
2508 int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) {
2509 return vld4q_lane_s8(a, b, 15);
2510 }
2512 // CHECK-LABEL: define{{.*}} %struct.int16x8x4_t @test_vld4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
2513 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
2514 // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
2515 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
2516 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
2517 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
2518 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2519 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2520 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2521 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
2522 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
2523 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2524 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2525 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
2526 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
2527 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2528 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2529 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
2530 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
2531 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2532 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2533 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
2534 // CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
2535 // CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2536 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2537 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2538 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2539 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2540 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
2541 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2542 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2543 // CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, ptr [[RETVAL]], align 16
2544 // CHECK: ret %struct.int16x8x4_t [[TMP19]]
2545 int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) {
2546 return vld4q_lane_s16(a, b, 7);
2547 }
2549 // CHECK-LABEL: define{{.*}} %struct.int32x4x4_t @test_vld4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
2550 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
2551 // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
2552 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
2553 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
2554 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
2555 // CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2556 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2557 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2558 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
2559 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
2560 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
2561 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2562 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
2563 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
2564 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
2565 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2566 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
2567 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
2568 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
2569 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2570 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
2571 // CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
2572 // CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2573 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2574 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2575 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
2576 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
2577 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
2578 // CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
2579 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2580 // CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, ptr [[RETVAL]], align 16
2581 // CHECK: ret %struct.int32x4x4_t [[TMP19]]
2582 int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) {
2583 return vld4q_lane_s32(a, b, 3);
2584 }
2586 // CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
2587 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
2588 // CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
2589 // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
2590 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
2591 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
2592 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2593 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2594 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2595 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
2596 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
2597 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2598 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2599 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
2600 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
2601 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2602 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2603 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
2604 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
2605 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2606 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2607 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
2608 // CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
2609 // CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
2610 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2611 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2612 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2613 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2614 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
2615 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2616 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2617 // CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
2618 // CHECK: ret %struct.int64x2x4_t [[TMP19]]
2619 int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) {
2620 return vld4q_lane_s64(a, b, 1);
2621 }
2623 // CHECK-LABEL: define{{.*}} %struct.float16x8x4_t @test_vld4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 {
2624 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
2625 // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
2626 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
2627 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
2628 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
2629 // CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2630 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2631 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2632 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
2633 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
2634 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
2635 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2636 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
2637 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
2638 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
2639 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2640 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
2641 // CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
2642 // CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
2643 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2644 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
2645 // CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
2646 // CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
2647 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
2648 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
2649 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
2650 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
2651 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, ptr %a)
2652 // CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], ptr [[__RET]]
2653 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2654 // CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, ptr [[RETVAL]], align 16
2655 // CHECK: ret %struct.float16x8x4_t [[TMP19]]
2656 float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) {
2657 return vld4q_lane_f16(a, b, 7);
2658 }
2660 // CHECK-LABEL: define{{.*}} %struct.float32x4x4_t @test_vld4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 {
2661 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
2662 // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
2663 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
2664 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
2665 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
2666 // CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2667 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2668 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2669 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
2670 // CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
2671 // CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
2672 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2673 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
2674 // CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
2675 // CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
2676 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2677 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
2678 // CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
2679 // CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
2680 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2681 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
2682 // CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
2683 // CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
2684 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
2685 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
2686 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
2687 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
2688 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, ptr %a)
2689 // CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], ptr [[__RET]]
2690 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2691 // CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, ptr [[RETVAL]], align 16
2692 // CHECK: ret %struct.float32x4x4_t [[TMP19]]
2693 float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) {
2694 return vld4q_lane_f32(a, b, 3);
2695 }
2697 // CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
2698 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
2699 // CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
2700 // CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
2701 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
2702 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
2703 // CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2704 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2705 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2706 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
2707 // CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
2708 // CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
2709 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2710 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
2711 // CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
2712 // CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
2713 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2714 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
2715 // CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
2716 // CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
2717 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2718 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
2719 // CHECK: [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
2720 // CHECK: [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
2721 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
2722 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
2723 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
2724 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
2725 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, ptr %a)
2726 // CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], ptr [[__RET]]
2727 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2728 // CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
2729 // CHECK: ret %struct.float64x2x4_t [[TMP19]]
2730 float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) {
2731 return vld4q_lane_f64(a, b, 1);
2732 }
2734 // CHECK-LABEL: define{{.*}} %struct.poly8x16x4_t @test_vld4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
2735 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
2736 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
2737 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
2738 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
2739 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
2740 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2741 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2742 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2743 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
2744 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
2745 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2746 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
2747 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
2748 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2749 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
2750 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
2751 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2752 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
2753 // CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
2754 // CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
2755 // CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2756 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2757 // CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, ptr [[RETVAL]], align 16
2758 // CHECK: ret %struct.poly8x16x4_t [[TMP10]]
2759 poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
2760 return vld4q_lane_p8(a, b, 15);
2761 }
2763 // CHECK-LABEL: define{{.*}} %struct.poly16x8x4_t @test_vld4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
2764 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
2765 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
2766 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
2767 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
2768 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
2769 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2770 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2771 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2772 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
2773 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
2774 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2775 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2776 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
2777 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
2778 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2779 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2780 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
2781 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
2782 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2783 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2784 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
2785 // CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
2786 // CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2787 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2788 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2789 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2790 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2791 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
2792 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2793 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2794 // CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, ptr [[RETVAL]], align 16
2795 // CHECK: ret %struct.poly16x8x4_t [[TMP19]]
2796 poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
2797 return vld4q_lane_p16(a, b, 7);
2798 }
2800 // CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
2801 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
2802 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
2803 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
2804 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
2805 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
2806 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2807 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2808 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2809 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
2810 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
2811 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2812 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2813 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
2814 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
2815 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2816 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2817 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
2818 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
2819 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2820 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2821 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
2822 // CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
2823 // CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
2824 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2825 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2826 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2827 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2828 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
2829 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2830 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2831 // CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
2832 // CHECK: ret %struct.poly64x2x4_t [[TMP19]]
2833 poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
2834 return vld4q_lane_p64(a, b, 1);
2835 }
2837 // CHECK-LABEL: define{{.*}} %struct.uint8x8x4_t @test_vld4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2838 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
2839 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
2840 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
2841 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
2842 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
2843 // CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2844 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2845 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2846 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2847 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2848 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2849 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2850 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2851 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2852 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2853 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2854 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2855 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
2856 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
2857 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
2858 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2859 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2860 // CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, ptr [[RETVAL]], align 8
2861 // CHECK: ret %struct.uint8x8x4_t [[TMP10]]
2862 uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) {
2863 return vld4_lane_u8(a, b, 7);
2864 }
2866 // CHECK-LABEL: define{{.*}} %struct.uint16x4x4_t @test_vld4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
2867 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
2868 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
2869 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
2870 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
2871 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
2872 // CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2873 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2874 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2875 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
2876 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
2877 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2878 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2879 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
2880 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
2881 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2882 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2883 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
2884 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
2885 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2886 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2887 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
2888 // CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
2889 // CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
2890 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2891 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2892 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2893 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
2894 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
2895 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2896 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2897 // CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, ptr [[RETVAL]], align 8
2898 // CHECK: ret %struct.uint16x4x4_t [[TMP19]]
2899 uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) {
2900 return vld4_lane_u16(a, b, 3);
2901 }
2903 // CHECK-LABEL: define{{.*}} %struct.uint32x2x4_t @test_vld4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
2904 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
2905 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
2906 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
2907 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
2908 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
2909 // CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2910 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2911 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2912 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
2913 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
2914 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
2915 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2916 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
2917 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
2918 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
2919 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2920 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
2921 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
2922 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
2923 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2924 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
2925 // CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
2926 // CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
2927 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
2928 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
2929 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
2930 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
2931 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
2932 // CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
2933 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2934 // CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, ptr [[RETVAL]], align 8
2935 // CHECK: ret %struct.uint32x2x4_t [[TMP19]]
2936 uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) {
2937 return vld4_lane_u32(a, b, 1);
2938 }
2940 // CHECK-LABEL: define{{.*}} %struct.uint64x1x4_t @test_vld4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2941 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
2942 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
2943 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
2944 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
2945 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
2946 // CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2947 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2948 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2949 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2950 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2951 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2952 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2953 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2954 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2955 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2956 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2957 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2958 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2959 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2960 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2961 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
2962 // CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
2963 // CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
2964 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2965 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2966 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2967 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
2968 // CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
2969 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2970 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2971 // CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, ptr [[RETVAL]], align 8
2972 // CHECK: ret %struct.uint64x1x4_t [[TMP19]]
2973 uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) {
2974 return vld4_lane_u64(a, b, 0);
2975 }
2977 // CHECK-LABEL: define{{.*}} %struct.int8x8x4_t @test_vld4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2978 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
2979 // CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
2980 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
2981 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
2982 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
2983 // CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2984 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2985 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2986 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2987 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2988 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2989 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2990 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2991 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2992 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2993 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2994 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2995 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
2996 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
2997 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
2998 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2999 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3000 // CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, ptr [[RETVAL]], align 8
3001 // CHECK: ret %struct.int8x8x4_t [[TMP10]]
3002 int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) {
3003 return vld4_lane_s8(a, b, 7);
3004 }
3006 // CHECK-LABEL: define{{.*}} %struct.int16x4x4_t @test_vld4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3007 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
3008 // CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
3009 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
3010 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
3011 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
3012 // CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3013 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3014 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3015 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
3016 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
3017 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
3018 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3019 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
3020 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
3021 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
3022 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3023 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
3024 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
3025 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
3026 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3027 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
3028 // CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
3029 // CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
3030 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
3031 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
3032 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
3033 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
3034 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
3035 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
3036 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3037 // CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, ptr [[RETVAL]], align 8
3038 // CHECK: ret %struct.int16x4x4_t [[TMP19]]
3039 int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) {
3040 return vld4_lane_s16(a, b, 3);
3041 }
3043 // CHECK-LABEL: define{{.*}} %struct.int32x2x4_t @test_vld4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
3044 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
3045 // CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
3046 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
3047 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
3048 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
3049 // CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3050 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3051 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3052 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
3053 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
3054 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
3055 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3056 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
3057 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
3058 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
3059 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3060 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
3061 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
3062 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
3063 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3064 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
3065 // CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
3066 // CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
3067 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
3068 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
3069 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
3070 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
3071 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
3072 // CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
3073 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3074 // CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, ptr [[RETVAL]], align 8
3075 // CHECK: ret %struct.int32x2x4_t [[TMP19]]
3076 int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) {
3077 return vld4_lane_s32(a, b, 1);
3078 }
3080 // CHECK-LABEL: define{{.*}} %struct.int64x1x4_t @test_vld4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
3081 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
3082 // CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
3083 // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
3084 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
3085 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
3086 // CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3087 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3088 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3089 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
3090 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
3091 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
3092 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3093 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
3094 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
3095 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
3096 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3097 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
3098 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
3099 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
3100 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3101 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
3102 // CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
3103 // CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
3104 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
3105 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
3106 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
3107 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
3108 // CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
3109 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
3110 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3111 // CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, ptr [[RETVAL]], align 8
3112 // CHECK: ret %struct.int64x1x4_t [[TMP19]]
3113 int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) {
3114 return vld4_lane_s64(a, b, 0);
3115 }
3117 // CHECK-LABEL: define{{.*}} %struct.float16x4x4_t @test_vld4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
3118 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
3119 // CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
3120 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
3121 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
3122 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
3123 // CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3124 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3125 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3126 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
3127 // CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
3128 // CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
3129 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3130 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
3131 // CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
3132 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
3133 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3134 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
3135 // CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
3136 // CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
3137 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3138 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
3139 // CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
3140 // CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
3141 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
3142 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
3143 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
3144 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
3145 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, ptr %a)
3146 // CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], ptr [[__RET]]
3147 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3148 // CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, ptr [[RETVAL]], align 8
3149 // CHECK: ret %struct.float16x4x4_t [[TMP19]]
3150 float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
3151 return vld4_lane_f16(a, b, 3);
3152 }
3154 // CHECK-LABEL: define{{.*}} %struct.float32x2x4_t @test_vld4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
3155 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
3156 // CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
3157 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
3158 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
3159 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
3160 // CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3161 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3162 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3163 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
3164 // CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
3165 // CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
3166 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3167 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
3168 // CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
3169 // CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
3170 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3171 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
3172 // CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
3173 // CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
3174 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3175 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
3176 // CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
3177 // CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
3178 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
3179 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
3180 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
3181 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
3182 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, ptr %a)
3183 // CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], ptr [[__RET]]
3184 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3185 // CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, ptr [[RETVAL]], align 8
3186 // CHECK: ret %struct.float32x2x4_t [[TMP19]]
3187 float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
3188 return vld4_lane_f32(a, b, 1);
3189 }
3191 // CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
3192 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
3193 // CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
3194 // CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
3195 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
3196 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
3197 // CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3198 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3199 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3200 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
3201 // CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
3202 // CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
3203 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3204 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
3205 // CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
3206 // CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
3207 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3208 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
3209 // CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
3210 // CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
3211 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3212 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
3213 // CHECK: [[TMP10:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
3214 // CHECK: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
3215 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
3216 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
3217 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
3218 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
3219 // CHECK: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, ptr %a)
3220 // CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], ptr [[__RET]]
3221 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3222 // CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
3223 // CHECK: ret %struct.float64x1x4_t [[TMP19]]
3224 float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
3225 return vld4_lane_f64(a, b, 0);
3226 }
3228 // CHECK-LABEL: define{{.*}} %struct.poly8x8x4_t @test_vld4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
3229 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
3230 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
3231 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
3232 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
3233 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
3234 // CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3235 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3236 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3237 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
3238 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
3239 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3240 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
3241 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
3242 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3243 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
3244 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
3245 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3246 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
3247 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
3248 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
3249 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
3250 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3251 // CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, ptr [[RETVAL]], align 8
3252 // CHECK: ret %struct.poly8x8x4_t [[TMP10]]
3253 poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) {
3254 return vld4_lane_p8(a, b, 7);
3255 }
3257 // CHECK-LABEL: define{{.*}} %struct.poly16x4x4_t @test_vld4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3258 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
3259 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
3260 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
3261 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
3262 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
3263 // CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3264 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3265 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3266 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
3267 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
3268 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
3269 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3270 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
3271 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
3272 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
3273 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3274 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
3275 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
3276 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
3277 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3278 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
3279 // CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
3280 // CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
3281 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
3282 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
3283 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
3284 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
3285 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
3286 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
3287 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3288 // CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, ptr [[RETVAL]], align 8
3289 // CHECK: ret %struct.poly16x4x4_t [[TMP19]]
3290 poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) {
3291 return vld4_lane_p16(a, b, 3);
3292 }
3294 // CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
3295 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
3296 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
3297 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
3298 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
3299 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
3300 // CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3301 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3302 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3303 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
3304 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
3305 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
3306 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3307 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
3308 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
3309 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
3310 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3311 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
3312 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
3313 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
3314 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3315 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
3316 // CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
3317 // CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
3318 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
3319 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
3320 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
3321 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
3322 // CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
3323 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
3324 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3325 // CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
3326 // CHECK: ret %struct.poly64x1x4_t [[TMP19]]
3327 poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) {
3328 return vld4_lane_p64(a, b, 0);
3329 }
3331 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
3332 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
3333 // CHECK: store i8 [[TMP0]], ptr %a
3334 // CHECK: ret void
3335 void test_vst1q_lane_u8(uint8_t *a, uint8x16_t b) {
3336 vst1q_lane_u8(a, b, 15);
3337 }
3339 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
3340 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3341 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3342 // CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
3343 // CHECK: store i16 [[TMP3]], ptr %a
3344 // CHECK: ret void
3345 void test_vst1q_lane_u16(uint16_t *a, uint16x8_t b) {
3346 vst1q_lane_u16(a, b, 7);
3347 }
3349 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
3350 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3351 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3352 // CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
3353 // CHECK: store i32 [[TMP3]], ptr %a
3354 // CHECK: ret void
3355 void test_vst1q_lane_u32(uint32_t *a, uint32x4_t b) {
3356 vst1q_lane_u32(a, b, 3);
3357 }
3359 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
3360 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3361 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3362 // CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
3363 // CHECK: store i64 [[TMP3]], ptr %a
3364 // CHECK: ret void
3365 void test_vst1q_lane_u64(uint64_t *a, uint64x2_t b) {
3366 vst1q_lane_u64(a, b, 1);
3367 }
3369 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
3370 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
3371 // CHECK: store i8 [[TMP0]], ptr %a
3372 // CHECK: ret void
3373 void test_vst1q_lane_s8(int8_t *a, int8x16_t b) {
3374 vst1q_lane_s8(a, b, 15);
3375 }
3377 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
3378 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3379 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3380 // CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
3381 // CHECK: store i16 [[TMP3]], ptr %a
3382 // CHECK: ret void
3383 void test_vst1q_lane_s16(int16_t *a, int16x8_t b) {
3384 vst1q_lane_s16(a, b, 7);
3385 }
3387 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
3388 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3389 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3390 // CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
3391 // CHECK: store i32 [[TMP3]], ptr %a
3392 // CHECK: ret void
3393 void test_vst1q_lane_s32(int32_t *a, int32x4_t b) {
3394 vst1q_lane_s32(a, b, 3);
3395 }
3397 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
3398 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3399 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3400 // CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
3401 // CHECK: store i64 [[TMP3]], ptr %a
3402 // CHECK: ret void
3403 void test_vst1q_lane_s64(int64_t *a, int64x2_t b) {
3404 vst1q_lane_s64(a, b, 1);
3405 }
3407 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
3408 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
3409 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
3410 // CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
3411 // CHECK: store half [[TMP3]], ptr %a
3412 // CHECK: ret void
3413 void test_vst1q_lane_f16(float16_t *a, float16x8_t b) {
3414 vst1q_lane_f16(a, b, 7);
3415 }
3417 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
3418 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3419 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3420 // CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
3421 // CHECK: store float [[TMP3]], ptr %a
3422 // CHECK: ret void
3423 void test_vst1q_lane_f32(float32_t *a, float32x4_t b) {
3424 vst1q_lane_f32(a, b, 3);
3425 }
3427 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
3428 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
3429 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3430 // CHECK: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
3431 // CHECK: store double [[TMP3]], ptr %a
3432 // CHECK: ret void
3433 void test_vst1q_lane_f64(float64_t *a, float64x2_t b) {
3434 vst1q_lane_f64(a, b, 1);
3435 }
3437 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
3438 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
3439 // CHECK: store i8 [[TMP0]], ptr %a
3440 // CHECK: ret void
3441 void test_vst1q_lane_p8(poly8_t *a, poly8x16_t b) {
3442 vst1q_lane_p8(a, b, 15);
3443 }
3445 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
3446 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3447 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3448 // CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
3449 // CHECK: store i16 [[TMP3]], ptr %a
3450 // CHECK: ret void
3451 void test_vst1q_lane_p16(poly16_t *a, poly16x8_t b) {
3452 vst1q_lane_p16(a, b, 7);
3453 }
3455 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
3456 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3457 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3458 // CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
3459 // CHECK: store i64 [[TMP3]], ptr %a
3460 // CHECK: ret void
3461 void test_vst1q_lane_p64(poly64_t *a, poly64x2_t b) {
3462 vst1q_lane_p64(a, b, 1);
3463 }
3465 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
3466 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
3467 // CHECK: store i8 [[TMP0]], ptr %a
3468 // CHECK: ret void
3469 void test_vst1_lane_u8(uint8_t *a, uint8x8_t b) {
3470 vst1_lane_u8(a, b, 7);
3471 }
3473 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
3474 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3475 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3476 // CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
3477 // CHECK: store i16 [[TMP3]], ptr %a
3478 // CHECK: ret void
3479 void test_vst1_lane_u16(uint16_t *a, uint16x4_t b) {
3480 vst1_lane_u16(a, b, 3);
3481 }
3483 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
3484 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3485 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3486 // CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
3487 // CHECK: store i32 [[TMP3]], ptr %a
3488 // CHECK: ret void
3489 void test_vst1_lane_u32(uint32_t *a, uint32x2_t b) {
3490 vst1_lane_u32(a, b, 1);
3491 }
3493 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
3494 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3495 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3496 // CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
3497 // CHECK: store i64 [[TMP3]], ptr %a
3498 // CHECK: ret void
3499 void test_vst1_lane_u64(uint64_t *a, uint64x1_t b) {
3500 vst1_lane_u64(a, b, 0);
3501 }
3503 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
3504 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
3505 // CHECK: store i8 [[TMP0]], ptr %a
3506 // CHECK: ret void
3507 void test_vst1_lane_s8(int8_t *a, int8x8_t b) {
3508 vst1_lane_s8(a, b, 7);
3509 }
3511 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
3512 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3513 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3514 // CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
3515 // CHECK: store i16 [[TMP3]], ptr %a
3516 // CHECK: ret void
3517 void test_vst1_lane_s16(int16_t *a, int16x4_t b) {
3518 vst1_lane_s16(a, b, 3);
3519 }
3521 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
3522 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3523 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3524 // CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
3525 // CHECK: store i32 [[TMP3]], ptr %a
3526 // CHECK: ret void
3527 void test_vst1_lane_s32(int32_t *a, int32x2_t b) {
3528 vst1_lane_s32(a, b, 1);
3529 }
3531 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
3532 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3533 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3534 // CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
3535 // CHECK: store i64 [[TMP3]], ptr %a
3536 // CHECK: ret void
3537 void test_vst1_lane_s64(int64_t *a, int64x1_t b) {
3538 vst1_lane_s64(a, b, 0);
3539 }
3541 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
3542 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
3543 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
3544 // CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
3545 // CHECK: store half [[TMP3]], ptr %a
3546 // CHECK: ret void
3547 void test_vst1_lane_f16(float16_t *a, float16x4_t b) {
3548 vst1_lane_f16(a, b, 3);
3549 }
3551 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
3552 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3553 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3554 // CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
3555 // CHECK: store float [[TMP3]], ptr %a
3556 // CHECK: ret void
3557 void test_vst1_lane_f32(float32_t *a, float32x2_t b) {
3558 vst1_lane_f32(a, b, 1);
3559 }
3561 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
3562 // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
3563 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
3564 // CHECK: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
3565 // CHECK: store double [[TMP3]], ptr %a
3566 // CHECK: ret void
3567 void test_vst1_lane_f64(float64_t *a, float64x1_t b) {
3568 vst1_lane_f64(a, b, 0);
3569 }
3571 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
3572 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
3573 // CHECK: store i8 [[TMP0]], ptr %a
3574 // CHECK: ret void
3575 void test_vst1_lane_p8(poly8_t *a, poly8x8_t b) {
3576 vst1_lane_p8(a, b, 7);
3577 }
3579 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
3580 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3581 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3582 // CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
3583 // CHECK: store i16 [[TMP3]], ptr %a
3584 // CHECK: ret void
3585 void test_vst1_lane_p16(poly16_t *a, poly16x4_t b) {
3586 vst1_lane_p16(a, b, 3);
3587 }
3589 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
3590 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3591 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3592 // CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
3593 // CHECK: store i64 [[TMP3]], ptr %a
3594 // CHECK: ret void
3595 void test_vst1_lane_p64(poly64_t *a, poly64x1_t b) {
3596 vst1_lane_p64(a, b, 0);
3597 }
3599 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
3600 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
3601 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
3602 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
3603 // CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3604 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3605 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
3606 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
3607 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
3608 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
3609 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
3610 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
3611 // CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
3612 // CHECK: ret void
3613 void test_vst2q_lane_u8(uint8_t *a, uint8x16x2_t b) {
3614 vst2q_lane_u8(a, b, 15);
3615 }
3617 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
3618 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
3619 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
3620 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
3621 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3622 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3623 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
3624 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
3625 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
3626 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
3627 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
3628 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
3629 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
3630 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
3631 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
3632 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
3633 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
3634 // CHECK: ret void
3635 void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
3636 vst2q_lane_u16(a, b, 7);
3637 }
3639 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
3640 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
3641 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
3642 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
3643 // CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3644 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3645 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
3646 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
3647 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
3648 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
3649 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
3650 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
3651 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
3652 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
3653 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
3654 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
3655 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
3656 // CHECK: ret void
3657 void test_vst2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
3658 vst2q_lane_u32(a, b, 3);
3659 }
3661 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
3662 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
3663 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
3664 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
3665 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3666 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3667 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
3668 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
3669 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
3670 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
3671 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
3672 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
3673 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
3674 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
3675 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
3676 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
3677 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
3678 // CHECK: ret void
3679 void test_vst2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
3680 vst2q_lane_u64(a, b, 1);
3681 }
3683 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
3684 // CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
3685 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
3686 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
3687 // CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3688 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3689 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
3690 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
3691 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
3692 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
3693 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
3694 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
3695 // CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
3696 // CHECK: ret void
3697 void test_vst2q_lane_s8(int8_t *a, int8x16x2_t b) {
3698 vst2q_lane_s8(a, b, 15);
3699 }
3701 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
3702 // CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
3703 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
3704 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
3705 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3706 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3707 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
3708 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
3709 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
3710 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
3711 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
3712 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
3713 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
3714 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
3715 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
3716 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
3717 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
3718 // CHECK: ret void
3719 void test_vst2q_lane_s16(int16_t *a, int16x8x2_t b) {
3720 vst2q_lane_s16(a, b, 7);
3721 }
3723 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
3724 // CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
3725 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
3726 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
3727 // CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3728 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3729 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
3730 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
3731 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
3732 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
3733 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
3734 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
3735 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
3736 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
3737 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
3738 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
3739 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
3740 // CHECK: ret void
3741 void test_vst2q_lane_s32(int32_t *a, int32x4x2_t b) {
3742 vst2q_lane_s32(a, b, 3);
3743 }
3745 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
3746 // CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
3747 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
3748 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
3749 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3750 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3751 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
3752 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
3753 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
3754 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
3755 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
3756 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
3757 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
3758 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
3759 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
3760 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
3761 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
3762 // CHECK: ret void
3763 void test_vst2q_lane_s64(int64_t *a, int64x2x2_t b) {
3764 vst2q_lane_s64(a, b, 1);
3765 }
3767 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
3768 // CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
3769 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
3770 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
3771 // CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3772 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3773 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
3774 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
3775 // CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
3776 // CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
3777 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
3778 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
3779 // CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
3780 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
3781 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
3782 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
3783 // CHECK: call void @llvm.aarch64.neon.st2lane.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, ptr %a)
3784 // CHECK: ret void
3785 void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) {
3786 vst2q_lane_f16(a, b, 7);
3787 }
3789 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 {
3790 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
3791 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
3792 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
3793 // CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3794 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3795 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
3796 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
3797 // CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
3798 // CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
3799 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
3800 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
3801 // CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
3802 // CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
3803 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
3804 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
3805 // CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, ptr %a)
3806 // CHECK: ret void
3807 void test_vst2q_lane_f32(float32_t *a, float32x4x2_t b) {
3808 vst2q_lane_f32(a, b, 3);
3809 }
3811 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 {
3812 // CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
3813 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
3814 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
3815 // CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3816 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3817 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
3818 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
3819 // CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
3820 // CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
3821 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
3822 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
3823 // CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
3824 // CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
3825 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
3826 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
3827 // CHECK: call void @llvm.aarch64.neon.st2lane.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, ptr %a)
3828 // CHECK: ret void
3829 void test_vst2q_lane_f64(float64_t *a, float64x2x2_t b) {
3830 vst2q_lane_f64(a, b, 1);
3831 }
3833 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
3834 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
3835 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
3836 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
3837 // CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3838 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3839 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
3840 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
3841 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
3842 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
3843 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
3844 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
3845 // CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
3846 // CHECK: ret void
3847 void test_vst2q_lane_p8(poly8_t *a, poly8x16x2_t b) {
3848 vst2q_lane_p8(a, b, 15);
3849 }
3851 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
3852 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
3853 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
3854 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
3855 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3856 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3857 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
3858 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
3859 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
3860 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
3861 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
3862 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
3863 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
3864 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
3865 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
3866 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
3867 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
3868 // CHECK: ret void
3869 void test_vst2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
3870 vst2q_lane_p16(a, b, 7);
3871 }
3873 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
3874 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
3875 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
3876 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
3877 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3878 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3879 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
3880 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
3881 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
3882 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
3883 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
3884 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
3885 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
3886 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
3887 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
3888 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
3889 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
3890 // CHECK: ret void
3891 void test_vst2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
3892 vst2q_lane_p64(a, b, 1);
3893 }
3895 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
3896 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
3897 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
3898 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
3899 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3900 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3901 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
3902 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
3903 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
3904 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
3905 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
3906 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
3907 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
3908 // CHECK: ret void
3909 void test_vst2_lane_u8(uint8_t *a, uint8x8x2_t b) {
3910   vst2_lane_u8(a, b, 7);
3911 }
3913 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3914 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
3915 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
3916 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
3917 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3918 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3919 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
3920 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
3921 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
3922 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
3923 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
3924 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
3925 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
3926 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
3927 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
3928 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
3929 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
3930 // CHECK: ret void
3931 void test_vst2_lane_u16(uint16_t *a, uint16x4x2_t b) {
3932   vst2_lane_u16(a, b, 3);
3933 }
3935 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
3936 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
3937 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
3938 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
3939 // CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3940 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3941 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
3942 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
3943 // CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
3944 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
3945 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
3946 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
3947 // CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
3948 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
3949 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
3950 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
3951 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a)
3952 // CHECK: ret void
3953 void test_vst2_lane_u32(uint32_t *a, uint32x2x2_t b) {
3954   vst2_lane_u32(a, b, 1);
3955 }
3957 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
3958 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
3959 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
3960 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
3961 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3962 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3963 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
3964 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
3965 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
3966 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
3967 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
3968 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
3969 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
3970 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
3971 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
3972 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
3973 // CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
3974 // CHECK: ret void
3975 void test_vst2_lane_u64(uint64_t *a, uint64x1x2_t b) {
3976   vst2_lane_u64(a, b, 0);
3977 }
3979 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
3980 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
3981 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
3982 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
3983 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3984 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3985 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
3986 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
3987 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
3988 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
3989 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
3990 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
3991 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
3992 // CHECK: ret void
3993 void test_vst2_lane_s8(int8_t *a, int8x8x2_t b) {
3994   vst2_lane_s8(a, b, 7);
3995 }
3997 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3998 // CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
3999 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
4000 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
4001 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4002 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4003 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
4004 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4005 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4006 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4007 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
4008 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4009 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4010 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4011 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4012 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4013 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
4014 // CHECK: ret void
4015 void test_vst2_lane_s16(int16_t *a, int16x4x2_t b) {
4016   vst2_lane_s16(a, b, 3);
4017 }
4019 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
4020 // CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
4021 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
4022 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
4023 // CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4024 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4025 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
4026 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
4027 // CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
4028 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4029 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
4030 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
4031 // CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
4032 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4033 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4034 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4035 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a)
4036 // CHECK: ret void
4037 void test_vst2_lane_s32(int32_t *a, int32x2x2_t b) {
4038   vst2_lane_s32(a, b, 1);
4039 }
4041 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4042 // CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
4043 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
4044 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
4045 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4046 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4047 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
4048 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4049 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4050 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4051 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
4052 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4053 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4054 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4055 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4056 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4057 // CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
4058 // CHECK: ret void
4059 void test_vst2_lane_s64(int64_t *a, int64x1x2_t b) {
4060   vst2_lane_s64(a, b, 0);
4061 }
4063 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 {
4064 // CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
4065 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
4066 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
4067 // CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4068 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4069 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
4070 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
4071 // CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
4072 // CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
4073 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
4074 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
4075 // CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
4076 // CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
4077 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
4078 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
4079 // CHECK: call void @llvm.aarch64.neon.st2lane.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, ptr %a)
4080 // CHECK: ret void
4081 void test_vst2_lane_f16(float16_t *a, float16x4x2_t b) {
4082   vst2_lane_f16(a, b, 3);
4083 }
4085 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 {
4086 // CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
4087 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
4088 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
4089 // CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4090 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4091 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
4092 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
4093 // CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
4094 // CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
4095 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
4096 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
4097 // CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
4098 // CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
4099 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
4100 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
4101 // CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, ptr %a)
4102 // CHECK: ret void
4103 void test_vst2_lane_f32(float32_t *a, float32x2x2_t b) {
4104   vst2_lane_f32(a, b, 1);
4105 }
4107 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 {
4108 // CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
4109 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
4110 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
4111 // CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4112 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4113 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
4114 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
4115 // CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
4116 // CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
4117 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
4118 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
4119 // CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
4120 // CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
4121 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
4122 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
4123 // CHECK: call void @llvm.aarch64.neon.st2lane.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, ptr %a)
4124 // CHECK: ret void
4125 void test_vst2_lane_f64(float64_t *a, float64x1x2_t b) {
4126   vst2_lane_f64(a, b, 0);
4127 }
4129 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4130 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
4131 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
4132 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
4133 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4134 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4135 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
4136 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4137 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4138 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
4139 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4140 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4141 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
4142 // CHECK: ret void
4143 void test_vst2_lane_p8(poly8_t *a, poly8x8x2_t b) {
4144   vst2_lane_p8(a, b, 7);
4145 }
4147 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4148 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
4149 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
4150 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
4151 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4152 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4153 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
4154 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4155 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4156 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4157 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
4158 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4159 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4160 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4161 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4162 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4163 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
4164 // CHECK: ret void
4165 void test_vst2_lane_p16(poly16_t *a, poly16x4x2_t b) {
4166   vst2_lane_p16(a, b, 3);
4167 }
4169 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4170 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
4171 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
4172 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
4173 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4174 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4175 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
4176 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4177 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4178 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4179 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
4180 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4181 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4182 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4183 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4184 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4185 // CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
4186 // CHECK: ret void
4187 void test_vst2_lane_p64(poly64_t *a, poly64x1x2_t b) {
4188   vst2_lane_p64(a, b, 0);
4189 }
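// The vst3q_lane_* tests below switch to the three-vector, 128-bit (Q-register) variants:
// the [3 x <N x T>] aggregate is spilled to a 16-byte-aligned temporary, copied with a
// 48-byte memcpy, and the three elements are reloaded (non-i8 element types via a
// <16 x i8> bitcast round trip) before the call to @llvm.aarch64.neon.st3lane.*.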
4191 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4192 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
4193 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
4194 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
4195 // CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4196 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4197 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
4198 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4199 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4200 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
4201 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4202 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4203 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
4204 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4205 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4206 // CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
4207 // CHECK: ret void
4208 void test_vst3q_lane_u8(uint8_t *a, uint8x16x3_t b) {
4209   vst3q_lane_u8(a, b, 15);
4210 }
4212 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4213 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
4214 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
4215 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
4216 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4217 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4218 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
4219 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4220 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4221 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4222 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
4223 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4224 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4225 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4226 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
4227 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4228 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4229 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4230 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4231 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4232 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4233 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
4234 // CHECK: ret void
4235 void test_vst3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
4236   vst3q_lane_u16(a, b, 7);
4237 }
4239 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
4240 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
4241 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
4242 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
4243 // CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4244 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4245 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
4246 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
4247 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
4248 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4249 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
4250 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
4251 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
4252 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4253 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
4254 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
4255 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
4256 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4257 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4258 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4259 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4260 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
4261 // CHECK: ret void
4262 void test_vst3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
4263   vst3q_lane_u32(a, b, 3);
4264 }
4266 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
4267 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
4268 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
4269 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
4270 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4271 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4272 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
4273 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
4274 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
4275 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4276 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
4277 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
4278 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
4279 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4280 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
4281 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
4282 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
4283 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
4284 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4285 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4286 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4287 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
4288 // CHECK: ret void
4289 void test_vst3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
4290   vst3q_lane_u64(a, b, 1);
4291 }
4293 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4294 // CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
4295 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
4296 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
4297 // CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4298 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4299 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
4300 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4301 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4302 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
4303 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4304 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4305 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
4306 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4307 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4308 // CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
4309 // CHECK: ret void
4310 void test_vst3q_lane_s8(int8_t *a, int8x16x3_t b) {
4311   vst3q_lane_s8(a, b, 15);
4312 }
4314 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4315 // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
4316 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
4317 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
4318 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4319 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4320 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
4321 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4322 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4323 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4324 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
4325 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4326 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4327 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4328 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
4329 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4330 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4331 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4332 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4333 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4334 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4335 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
4336 // CHECK: ret void
4337 void test_vst3q_lane_s16(int16_t *a, int16x8x3_t b) {
4338   vst3q_lane_s16(a, b, 7);
4339 }
4341 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
4342 // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
4343 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
4344 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
4345 // CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4346 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4347 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
4348 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
4349 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
4350 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4351 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
4352 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
4353 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
4354 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4355 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
4356 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
4357 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
4358 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4359 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4360 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4361 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4362 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
4363 // CHECK: ret void
4364 void test_vst3q_lane_s32(int32_t *a, int32x4x3_t b) {
4365   vst3q_lane_s32(a, b, 3);
4366 }
4368 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
4369 // CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
4370 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
4371 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
4372 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4373 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4374 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
4375 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
4376 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
4377 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4378 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
4379 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
4380 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
4381 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4382 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
4383 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
4384 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
4385 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
4386 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4387 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4388 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4389 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
4390 // CHECK: ret void
4391 void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) {
4392   vst3q_lane_s64(a, b, 1);
4393 }
4395 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
4396 // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
4397 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
4398 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
4399 // CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4400 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4401 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
4402 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
4403 // CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
4404 // CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
4405 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
4406 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
4407 // CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
4408 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
4409 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
4410 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
4411 // CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
4412 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
4413 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
4414 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
4415 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
4416 // CHECK: call void @llvm.aarch64.neon.st3lane.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, ptr %a)
4417 // CHECK: ret void
4418 void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) {
4419   vst3q_lane_f16(a, b, 7);
4420 }
4422 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
4423 // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
4424 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
4425 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
4426 // CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4427 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4428 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
4429 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
4430 // CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
4431 // CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
4432 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
4433 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
4434 // CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
4435 // CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
4436 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
4437 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
4438 // CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
4439 // CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
4440 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
4441 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
4442 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
4443 // CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, ptr %a)
4444 // CHECK: ret void
4445 void test_vst3q_lane_f32(float32_t *a, float32x4x3_t b) {
4446   vst3q_lane_f32(a, b, 3);
4447 }
4449 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
4450 // CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
4451 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
4452 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
4453 // CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4454 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4455 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
4456 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
4457 // CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
4458 // CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
4459 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
4460 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
4461 // CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
4462 // CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
4463 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
4464 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
4465 // CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
4466 // CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
4467 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
4468 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
4469 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
4470 // CHECK: call void @llvm.aarch64.neon.st3lane.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, ptr %a)
4471 // CHECK: ret void
4472 void test_vst3q_lane_f64(float64_t *a, float64x2x3_t b) {
4473   vst3q_lane_f64(a, b, 1);
4474 }
4476 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4477 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
4478 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
4479 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
4480 // CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4481 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4482 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
4483 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4484 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4485 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
4486 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4487 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4488 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
4489 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4490 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4491 // CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
4492 // CHECK: ret void
4493 void test_vst3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
4494   vst3q_lane_p8(a, b, 15);
4495 }
4497 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4498 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
4499 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
4500 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
4501 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4502 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4503 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
4504 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4505 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4506 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4507 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
4508 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4509 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4510 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4511 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
4512 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4513 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4514 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4515 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4516 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4517 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4518 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
4519 // CHECK: ret void
4520 void test_vst3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
4521   vst3q_lane_p16(a, b, 7);
4522 }
4524 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
4525 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
4526 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
4527 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
4528 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4529 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4530 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
4531 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
4532 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
4533 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4534 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
4535 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
4536 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
4537 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4538 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
4539 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
4540 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
4541 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
4542 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4543 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4544 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4545 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
4546 // CHECK: ret void
4547 void test_vst3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
4548   vst3q_lane_p64(a, b, 1);
4549 }
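// The vst3_lane_* tests below are the 64-bit (D-register) counterparts: an 8-byte-aligned
// temporary, a 24-byte memcpy, and <8 x i8> bitcast round trips for the non-i8 element
// types, ending in a call to @llvm.aarch64.neon.st3lane.* with the lane immediate.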
4551 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4552 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
4553 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
4554 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
4555 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4556 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4557 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
4558 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4559 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4560 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
4561 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4562 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4563 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
4564 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
4565 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
4566 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
4567 // CHECK: ret void
4568 void test_vst3_lane_u8(uint8_t *a, uint8x8x3_t b) {
4569   vst3_lane_u8(a, b, 7);
4570 }
4572 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4573 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
4574 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
4575 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
4576 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4577 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4578 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
4579 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4580 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4581 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4582 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
4583 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4584 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4585 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4586 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
4587 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
4588 // CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
4589 // CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4590 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4591 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4592 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4593 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
4594 // CHECK: ret void
4595 void test_vst3_lane_u16(uint16_t *a, uint16x4x3_t b) {
4596   vst3_lane_u16(a, b, 3);
4597 }
4599 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
4600 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
4601 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
4602 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
4603 // CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4604 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4605 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
4606 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
4607 // CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
4608 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4609 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
4610 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
4611 // CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
4612 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4613 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
4614 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
4615 // CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
4616 // CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
4617 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4618 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4619 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
4620 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
4621 // CHECK: ret void
4622 void test_vst3_lane_u32(uint32_t *a, uint32x2x3_t b) {
4623   vst3_lane_u32(a, b, 1);
4624 }
4626 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4627 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
4628 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
4629 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
4630 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4631 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4632 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
4633 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4634 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4635 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4636 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
4637 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4638 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4639 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4640 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
4641 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
4642 // CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
4643 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
4644 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4645 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4646 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
4647 // CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
4648 // CHECK: ret void
4649 void test_vst3_lane_u64(uint64_t *a, uint64x1x3_t b) {
4650 vst3_lane_u64(a, b, 0);
4651 }
4653 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4654 // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
4655 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
4656 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
4657 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4658 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4659 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
4660 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4661 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4662 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
4663 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4664 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4665 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
4666 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
4667 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
4668 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
4669 // CHECK: ret void
4670 void test_vst3_lane_s8(int8_t *a, int8x8x3_t b) {
4671 vst3_lane_s8(a, b, 7);
4672 }
4674 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4675 // CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
4676 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
4677 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
4678 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4679 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4680 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
4681 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4682 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4683 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4684 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
4685 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4686 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4687 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4688 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
4689 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
4690 // CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
4691 // CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4692 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4693 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4694 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4695 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
4696 // CHECK: ret void
4697 void test_vst3_lane_s16(int16_t *a, int16x4x3_t b) {
4698 vst3_lane_s16(a, b, 3);
4699 }
4701 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
4702 // CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
4703 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
4704 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
4705 // CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4706 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4707 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
4708 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
4709 // CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
4710 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4711 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
4712 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
4713 // CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
4714 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4715 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
4716 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
4717 // CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
4718 // CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
4719 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4720 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4721 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
4722 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
4723 // CHECK: ret void
4724 void test_vst3_lane_s32(int32_t *a, int32x2x3_t b) {
4725 vst3_lane_s32(a, b, 1);
4726 }
4728 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4729 // CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
4730 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
4731 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
4732 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4733 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4734 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
4735 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4736 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4737 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4738 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
4739 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4740 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4741 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4742 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
4743 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
4744 // CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
4745 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
4746 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4747 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4748 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
4749 // CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
4750 // CHECK: ret void
4751 void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) {
4752 vst3_lane_s64(a, b, 0);
4753 }
4755 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
4756 // CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
4757 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
4758 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
4759 // CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4760 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4761 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
4762 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
4763 // CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
4764 // CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
4765 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
4766 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
4767 // CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
4768 // CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
4769 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
4770 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
4771 // CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
4772 // CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
4773 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
4774 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
4775 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
4776 // CHECK: call void @llvm.aarch64.neon.st3lane.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, ptr %a)
4777 // CHECK: ret void
4778 void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) {
4779 vst3_lane_f16(a, b, 3);
4780 }
4782 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
4783 // CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
4784 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
4785 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
4786 // CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4787 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4788 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
4789 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
4790 // CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
4791 // CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
4792 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
4793 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
4794 // CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
4795 // CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
4796 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
4797 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
4798 // CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
4799 // CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
4800 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
4801 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
4802 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
4803 // CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, ptr %a)
4804 // CHECK: ret void
4805 void test_vst3_lane_f32(float32_t *a, float32x2x3_t b) {
4806 vst3_lane_f32(a, b, 1);
4807 }
4809 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
4810 // CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
4811 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
4812 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
4813 // CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4814 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4815 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
4816 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
4817 // CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
4818 // CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
4819 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
4820 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
4821 // CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
4822 // CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
4823 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
4824 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
4825 // CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
4826 // CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
4827 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
4828 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
4829 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
4830 // CHECK: call void @llvm.aarch64.neon.st3lane.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, ptr %a)
4831 // CHECK: ret void
4832 void test_vst3_lane_f64(float64_t *a, float64x1x3_t b) {
4833 vst3_lane_f64(a, b, 0);
4834 }
4836 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4837 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
4838 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
4839 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
4840 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4841 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4842 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
4843 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4844 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4845 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
4846 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4847 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4848 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
4849 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
4850 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
4851 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
4852 // CHECK: ret void
4853 void test_vst3_lane_p8(poly8_t *a, poly8x8x3_t b) {
4854 vst3_lane_p8(a, b, 7);
4855 }
4857 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4858 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
4859 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
4860 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
4861 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4862 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4863 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
4864 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4865 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4866 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4867 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
4868 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4869 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4870 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4871 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
4872 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
4873 // CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
4874 // CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4875 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4876 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4877 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4878 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
4879 // CHECK: ret void
4880 void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) {
4881 vst3_lane_p16(a, b, 3);
4882 }
4884 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4885 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
4886 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
4887 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
4888 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4889 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4890 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
4891 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4892 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4893 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4894 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
4895 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4896 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4897 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4898 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
4899 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
4900 // CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
4901 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
4902 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4903 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4904 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
4905 // CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
4906 // CHECK: ret void
4907 void test_vst3_lane_p64(poly64_t *a, poly64x1x3_t b) {
4908 vst3_lane_p64(a, b, 0);
4909 }
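// End of the 64-bit (d-register) vst3_lane variants. They all lower to the
// same llvm.aarch64.neon.st3lane.* call; only the element type and the legal
// lane range change (0-7 for 8-bit, 0-3 for 16-bit, 0-1 for 32-bit, and 0 for
// 64-bit elements, since a d register holds 8/4/2/1 lanes respectively).
// Hypothetical worked example of the store semantics (values invented purely
// for illustration):
//
//   uint32x2x3_t v = { { (uint32x2_t){10, 11},
//                        (uint32x2_t){20, 21},
//                        (uint32x2_t){30, 31} } };
//   vst3_lane_u32(p, v, 1);   // p[0] = 11, p[1] = 21, p[2] = 31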
4911 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4912 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
4913 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
4914 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
4915 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4916 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
4917 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4918 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4919 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4920 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4921 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4922 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4923 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4924 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4925 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4926 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4927 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
4928 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
4929 // CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
4930 // CHECK: ret void
4931 void test_vst4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
4932 vst4q_lane_u8(a, b, 15);
4933 }
4935 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4936 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
4937 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
4938 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
4939 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4940 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
4941 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4942 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4943 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4944 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4945 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4946 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4947 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4948 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4949 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4950 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4951 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4952 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4953 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4954 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
4955 // CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
4956 // CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
4957 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4958 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4959 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4960 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
4961 // CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
4962 // CHECK: ret void
4963 void test_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
4964 vst4q_lane_u16(a, b, 7);
4965 }
4967 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
4968 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
4969 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
4970 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
4971 // CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4972 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
4973 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4974 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
4975 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
4976 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4977 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4978 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
4979 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
4980 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4981 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4982 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
4983 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
4984 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4985 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4986 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
4987 // CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
4988 // CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
4989 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4990 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4991 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4992 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
4993 // CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a)
4994 // CHECK: ret void
4995 void test_vst4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
4996 vst4q_lane_u32(a, b, 3);
4997 }
4999 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
5000 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
5001 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
5002 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
5003 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5004 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5005 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5006 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
5007 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
5008 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5009 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5010 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
5011 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
5012 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5013 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5014 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
5015 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
5016 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5017 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5018 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
5019 // CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
5020 // CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5021 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5022 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5023 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5024 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
5025 // CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
5026 // CHECK: ret void
5027 void test_vst4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
5028 vst4q_lane_u64(a, b, 1);
5029 }
5031 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
5032 // CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
5033 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
5034 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
5035 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5036 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5037 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5038 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
5039 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
5040 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5041 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
5042 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
5043 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5044 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
5045 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
5046 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5047 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
5048 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
5049 // CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
5050 // CHECK: ret void
5051 void test_vst4q_lane_s8(int8_t *a, int8x16x4_t b) {
5052 vst4q_lane_s8(a, b, 15);
5053 }
5055 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
5056 // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
5057 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
5058 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
5059 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5060 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5061 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5062 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
5063 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
5064 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5065 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5066 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
5067 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
5068 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5069 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5070 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
5071 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
5072 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5073 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5074 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
5075 // CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
5076 // CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5077 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5078 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5079 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5080 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5081 // CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
5082 // CHECK: ret void
5083 void test_vst4q_lane_s16(int16_t *a, int16x8x4_t b) {
5084 vst4q_lane_s16(a, b, 7);
5085 }
5087 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
5088 // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
5089 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
5090 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
5091 // CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5092 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5093 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5094 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
5095 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
5096 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
5097 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5098 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
5099 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
5100 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5101 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5102 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
5103 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
5104 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5105 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5106 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
5107 // CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
5108 // CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
5109 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
5110 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5111 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5112 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
5113 // CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a)
5114 // CHECK: ret void
5115 void test_vst4q_lane_s32(int32_t *a, int32x4x4_t b) {
5116 vst4q_lane_s32(a, b, 3);
5117 }
5119 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
5120 // CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
5121 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
5122 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
5123 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5124 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5125 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5126 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
5127 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
5128 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5129 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5130 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
5131 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
5132 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5133 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5134 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
5135 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
5136 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5137 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5138 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
5139 // CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
5140 // CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5141 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5142 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5143 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5144 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
5145 // CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
5146 // CHECK: ret void
5147 void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) {
5148 vst4q_lane_s64(a, b, 1);
5149 }
5151 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 {
5152 // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
5153 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
5154 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
5155 // CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5156 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5157 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5158 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
5159 // CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
5160 // CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
5161 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5162 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
5163 // CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
5164 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
5165 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5166 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
5167 // CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
5168 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
5169 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5170 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
5171 // CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
5172 // CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
5173 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
5174 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
5175 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
5176 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
5177 // CHECK: call void @llvm.aarch64.neon.st4lane.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, ptr %a)
5178 // CHECK: ret void
5179 void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) {
5180 vst4q_lane_f16(a, b, 7);
5181 }
5183 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 {
5184 // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
5185 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
5186 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
5187 // CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5188 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5189 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5190 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
5191 // CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
5192 // CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
5193 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5194 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
5195 // CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
5196 // CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
5197 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5198 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
5199 // CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
5200 // CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
5201 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5202 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
5203 // CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
5204 // CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
5205 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
5206 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
5207 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
5208 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
5209 // CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, ptr %a)
5210 // CHECK: ret void
5211 void test_vst4q_lane_f32(float32_t *a, float32x4x4_t b) {
5212 vst4q_lane_f32(a, b, 3);
5213 }
5215 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
5216 // CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
5217 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
5218 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
5219 // CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5220 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5221 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5222 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
5223 // CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
5224 // CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
5225 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5226 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
5227 // CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
5228 // CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
5229 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5230 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
5231 // CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
5232 // CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
5233 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5234 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
5235 // CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
5236 // CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
5237 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
5238 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
5239 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
5240 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
5241 // CHECK: call void @llvm.aarch64.neon.st4lane.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, ptr %a)
5242 // CHECK: ret void
5243 void test_vst4q_lane_f64(float64_t *a, float64x2x4_t b) {
5244 vst4q_lane_f64(a, b, 1);
5245 }
5247 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
5248 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
5249 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
5250 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
5251 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5252 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5253 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5254 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
5255 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
5256 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5257 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
5258 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
5259 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5260 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
5261 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
5262 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5263 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
5264 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
5265 // CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
5266 // CHECK: ret void
5267 void test_vst4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
5268 vst4q_lane_p8(a, b, 15);
5269 }
5271 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
5272 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
5273 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
5274 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
5275 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5276 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5277 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5278 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
5279 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
5280 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5281 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5282 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
5283 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
5284 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5285 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5286 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
5287 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
5288 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5289 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5290 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
5291 // CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
5292 // CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5293 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5294 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5295 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5296 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5297 // CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
5298 // CHECK: ret void
5299 void test_vst4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
5300 vst4q_lane_p16(a, b, 7);
5301 }
5303 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
5304 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
5305 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
5306 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
5307 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5308 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5309 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5310 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
5311 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
5312 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5313 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5314 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
5315 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
5316 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5317 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5318 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
5319 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
5320 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5321 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5322 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
5323 // CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
5324 // CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5325 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5326 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5327 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5328 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
5329 // CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
5330 // CHECK: ret void
5331 void test_vst4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
5332 vst4q_lane_p64(a, b, 1);
5333 }
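// End of the 128-bit (q-register) vst4q_lane variants. They differ from the
// d-register forms mainly in width: the coerced [4 x <...>] argument totals 64
// bytes and is passed alignstack(16), and the element round trip goes through
// <16 x i8> instead of <8 x i8>. A minimal caller sketch (helper name and data
// are illustrative, not part of this test):
//
//   static void demo_vst4q_lane(uint16_t *dst, uint16x8x4_t quad) {
//     // Stores quad.val[0][7], quad.val[1][7], quad.val[2][7], quad.val[3][7]
//     // to dst[0..3] with a single ST4 (single structure, lane 7).
//     vst4q_lane_u16(dst, quad, 7);
//   }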
5335 // CHECK-LABEL: define{{.*}} void @test_vst4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
5336 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
5337 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
5338 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
5339 // CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5340 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5341 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5342 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
5343 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
5344 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5345 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
5346 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
5347 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5348 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
5349 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
5350 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5351 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
5352 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
5353 // CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
5354 // CHECK: ret void
5355 void test_vst4_lane_u8(uint8_t *a, uint8x8x4_t b) {
5356 vst4_lane_u8(a, b, 7);
5357 }
// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
// CHECK: ret void
void test_vst4_lane_u16(uint16_t *a, uint16x4x4_t b) {
  vst4_lane_u16(a, b, 3);
}

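// Note: for element types wider than i8, the unoptimized front-end output
// round-trips each operand through an <8 x i8> bitcast and back before the
// st4lane call. The casts are not removed by the mem2reg run this test
// applies, so the assertions match both directions of the cast.
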
// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
// CHECK: ret void
void test_vst4_lane_u32(uint32_t *a, uint32x2x4_t b) {
  vst4_lane_u32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
// CHECK: ret void
void test_vst4_lane_u64(uint64_t *a, uint64x1x4_t b) {
  vst4_lane_u64(a, b, 0);
}

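// A 64-bit D register holds only one 64-bit element, so the u64/s64/f64/p64
// variants can only store lane 0; this is why the lane operand of the
// v1i64/v1f64 st4lane calls is always i64 0.
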
// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
// CHECK: ret void
void test_vst4_lane_s8(int8_t *a, int8x8x4_t b) {
  vst4_lane_s8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
// CHECK: ret void
void test_vst4_lane_s16(int16_t *a, int16x4x4_t b) {
  vst4_lane_s16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
// CHECK: ret void
void test_vst4_lane_s32(int32_t *a, int32x2x4_t b) {
  vst4_lane_s32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
// CHECK: ret void
void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) {
  vst4_lane_s64(a, b, 0);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
// CHECK: call void @llvm.aarch64.neon.st4lane.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, ptr %a)
// CHECK: ret void
void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) {
  vst4_lane_f16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, ptr %a)
// CHECK: ret void
void test_vst4_lane_f32(float32_t *a, float32x2x4_t b) {
  vst4_lane_f32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
// CHECK: call void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, ptr %a)
// CHECK: ret void
void test_vst4_lane_f64(float64_t *a, float64x1x4_t b) {
  vst4_lane_f64(a, b, 0);
}

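// The polynomial variants reuse the corresponding integer vector layouts, so
// the p8/p16/p64 tests below lower to the same st4lane.v8i8/v4i16/v1i64
// intrinsics as the u8/u16/u64 tests.
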
// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
// CHECK: ret void
void test_vst4_lane_p8(poly8_t *a, poly8x8x4_t b) {
  vst4_lane_p8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
// CHECK: ret void
void test_vst4_lane_p16(poly16_t *a, poly16x4x4_t b) {
  vst4_lane_p16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
// CHECK: ret void
void test_vst4_lane_p64(poly64_t *a, poly64x1x4_t b) {
  vst4_lane_p64(a, b, 0);
}