// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg | FileCheck %s

// REQUIRES: aarch64-registered-target || arm-registered-target

#include <arm_neon.h>
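
// clang/test/CodeGen/AArch64/neon-ldst-one.c (llvm-project)
//
// This test exercises the AArch64 NEON single-element load intrinsics and
// checks the LLVM IR that Clang emits for them: the vld1*_dup / vldN*_dup
// forms (load one element and replicate it into every lane) and the
// vld1*_lane / vldN*_lane forms (load one element into a chosen lane of an
// existing vector or vector tuple).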
// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_u8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK: ret <16 x i8> [[LANE]]
uint8x16_t test_vld1q_dup_u8(uint8_t *a) {
  return vld1q_dup_u8(a);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_u16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i16> [[LANE]]
uint16x8_t test_vld1q_dup_u16(uint16_t *a) {
  return vld1q_dup_u16(a);
}

// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_u32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i32, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i32> [[LANE]]
uint32x4_t test_vld1q_dup_u32(uint32_t *a) {
  return vld1q_dup_u32(a);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_u64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
uint64x2_t test_vld1q_dup_u64(uint64_t *a) {
  return vld1q_dup_u64(a);
}

// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_s8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK: ret <16 x i8> [[LANE]]
int8x16_t test_vld1q_dup_s8(int8_t *a) {
  return vld1q_dup_s8(a);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_s16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i16> [[LANE]]
int16x8_t test_vld1q_dup_s16(int16_t *a) {
  return vld1q_dup_s16(a);
}

// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_s32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i32, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i32> [[LANE]]
int32x4_t test_vld1q_dup_s32(int32_t *a) {
  return vld1q_dup_s32(a);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_s64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
int64x2_t test_vld1q_dup_s64(int64_t *a) {
  return vld1q_dup_s64(a);
}

// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_dup_f16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load half, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x half> [[LANE]]
float16x8_t test_vld1q_dup_f16(float16_t *a) {
  return vld1q_dup_f16(a);
}

// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_dup_f32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load float, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x float> [[LANE]]
float32x4_t test_vld1q_dup_f32(float32_t *a) {
  return vld1q_dup_f32(a);
}

// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_dup_f64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load double, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x double> [[LANE]]
float64x2_t test_vld1q_dup_f64(float64_t *a) {
  return vld1q_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_p8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK: ret <16 x i8> [[LANE]]
poly8x16_t test_vld1q_dup_p8(poly8_t *a) {
  return vld1q_dup_p8(a);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_p16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i16> [[LANE]]
poly16x8_t test_vld1q_dup_p16(poly16_t *a) {
  return vld1q_dup_p16(a);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_p64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
poly64x2_t test_vld1q_dup_p64(poly64_t *a) {
  return vld1q_dup_p64(a);
}
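
// The 64-bit vld1_dup variants below follow the same pattern as the 128-bit
// ones: a scalar load, an insertelement into lane 0, and a zero-index
// shufflevector that splats the value across the vector.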
// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_u8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i8> [[LANE]]
uint8x8_t test_vld1_dup_u8(uint8_t *a) {
  return vld1_dup_u8(a);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_u16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i16> [[LANE]]
uint16x4_t test_vld1_dup_u16(uint16_t *a) {
  return vld1_dup_u16(a);
}

// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_u32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i32, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i32> [[LANE]]
uint32x2_t test_vld1_dup_u32(uint32_t *a) {
  return vld1_dup_u32(a);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_u64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
uint64x1_t test_vld1_dup_u64(uint64_t *a) {
  return vld1_dup_u64(a);
}

// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_s8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i8> [[LANE]]
int8x8_t test_vld1_dup_s8(int8_t *a) {
  return vld1_dup_s8(a);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_s16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i16> [[LANE]]
int16x4_t test_vld1_dup_s16(int16_t *a) {
  return vld1_dup_s16(a);
}

// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_s32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i32, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i32> [[LANE]]
int32x2_t test_vld1_dup_s32(int32_t *a) {
  return vld1_dup_s32(a);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_s64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
int64x1_t test_vld1_dup_s64(int64_t *a) {
  return vld1_dup_s64(a);
}

// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_dup_f16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load half, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x half> poison, half [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x half> [[LANE]]
float16x4_t test_vld1_dup_f16(float16_t *a) {
  return vld1_dup_f16(a);
}

// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_dup_f32(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load float, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x float> [[LANE]]
float32x2_t test_vld1_dup_f32(float32_t *a) {
  return vld1_dup_f32(a);
}

// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_dup_f64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load double, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x double> [[LANE]]
float64x1_t test_vld1_dup_f64(float64_t *a) {
  return vld1_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_p8(ptr noundef %a) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i8> [[LANE]]
poly8x8_t test_vld1_dup_p8(poly8_t *a) {
  return vld1_dup_p8(a);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_p16(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i16, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i16> [[LANE]]
poly16x4_t test_vld1_dup_p16(poly16_t *a) {
  return vld1_dup_p16(a);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_p64(ptr noundef %a) #0 {
// CHECK: [[TMP2:%.*]] = load i64, ptr %a
// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
poly64x1_t test_vld1_dup_p64(poly64_t *a) {
  return vld1_dup_p64(a);
}
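
// The vld2/vld3/vld4 dup intrinsics return NEON multi-vector structs. The
// CHECK lines expect a call to the corresponding llvm.aarch64.neon.ldNr
// intrinsic, a store of the returned aggregate into a temporary, and a
// memcpy into the return-value temporary that is then loaded and returned.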
// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_dup_u64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) {
  return vld2q_dup_u64(a);
}

// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_dup_s64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld2q_dup_s64(int64_t *a) {
  return vld2q_dup_s64(a);
}

// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0(ptr %a)
// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld2q_dup_f64(float64_t *a) {
  return vld2q_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) {
  return vld2q_dup_p64(a);
}

// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0(ptr %a)
// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld2_dup_f64(float64_t *a) {
  return vld2_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr %a)
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld2_dup_p64(poly64_t *a) {
  return vld2_dup_p64(a);
}

// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_dup_u64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) {
  return vld3q_dup_u64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_dup_s64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x3_t [[TMP6]]
int64x2x3_t test_vld3q_dup_s64(int64_t *a) {
  return vld3q_dup_s64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0(ptr %a)
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP6]]
float64x2x3_t test_vld3q_dup_f64(float64_t *a) {
  return vld3q_dup_f64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) {
  return vld3q_dup_p64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0(ptr %a)
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld3_dup_f64(float64_t *a) {
  return vld3_dup_f64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr %a)
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld3_dup_p64(poly64_t *a) {
  return vld3_dup_p64(a);
  // [{{x[0-9]+|sp}}]
}

// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_dup_u64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) {
  return vld4q_dup_u64(a);
}

// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_dup_s64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x4_t [[TMP6]]
int64x2x4_t test_vld4q_dup_s64(int64_t *a) {
  return vld4q_dup_s64(a);
}

// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0(ptr %a)
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld4q_dup_f64(float64_t *a) {
  return vld4q_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) {
  return vld4q_dup_p64(a);
}

// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_dup_f64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0(ptr %a)
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP6]]
float64x1x4_t test_vld4_dup_f64(float64_t *a) {
  return vld4_dup_f64(a);
}

// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_dup_p64(ptr noundef %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %a)
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld4_dup_p64(poly64_t *a) {
  return vld4_dup_p64(a);
}
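
// vld1q_lane tests: the intrinsic loads a single scalar and inserts it into
// the requested lane of the vector passed in %b, so the IR reduces to a load
// plus an insertelement (with bitcasts through <16 x i8> for element types
// other than i8).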
// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK: ret <16 x i8> [[VLD1_LANE]]
uint8x16_t test_vld1q_lane_u8(uint8_t *a, uint8x16_t b) {
  return vld1q_lane_u8(a, b, 15);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK: ret <8 x i16> [[VLD1_LANE]]
uint16x8_t test_vld1q_lane_u16(uint16_t *a, uint16x8_t b) {
  return vld1q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK: ret <4 x i32> [[VLD1_LANE]]
uint32x4_t test_vld1q_lane_u32(uint32_t *a, uint32x4_t b) {
  return vld1q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
// CHECK: ret <2 x i64> [[VLD1_LANE]]
uint64x2_t test_vld1q_lane_u64(uint64_t *a, uint64x2_t b) {
  return vld1q_lane_u64(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK: ret <16 x i8> [[VLD1_LANE]]
int8x16_t test_vld1q_lane_s8(int8_t *a, int8x16_t b) {
  return vld1q_lane_s8(a, b, 15);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK: ret <8 x i16> [[VLD1_LANE]]
int16x8_t test_vld1q_lane_s16(int16_t *a, int16x8_t b) {
  return vld1q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK: ret <4 x i32> [[VLD1_LANE]]
int32x4_t test_vld1q_lane_s32(int32_t *a, int32x4_t b) {
  return vld1q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
// CHECK: ret <2 x i64> [[VLD1_LANE]]
int64x2_t test_vld1q_lane_s64(int64_t *a, int64x2_t b) {
  return vld1q_lane_s64(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
// CHECK: [[TMP4:%.*]] = load half, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
// CHECK: ret <8 x half> [[VLD1_LANE]]
float16x8_t test_vld1q_lane_f16(float16_t *a, float16x8_t b) {
  return vld1q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP4:%.*]] = load float, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
// CHECK: ret <4 x float> [[VLD1_LANE]]
float32x4_t test_vld1q_lane_f32(float32_t *a, float32x4_t b) {
  return vld1q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[TMP4:%.*]] = load double, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
// CHECK: ret <2 x double> [[VLD1_LANE]]
float64x2_t test_vld1q_lane_f64(float64_t *a, float64x2_t b) {
  return vld1q_lane_f64(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK: ret <16 x i8> [[VLD1_LANE]]
poly8x16_t test_vld1q_lane_p8(poly8_t *a, poly8x16_t b) {
  return vld1q_lane_p8(a, b, 15);
}

// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK: ret <8 x i16> [[VLD1_LANE]]
poly16x8_t test_vld1q_lane_p16(poly16_t *a, poly16x8_t b) {
  return vld1q_lane_p16(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
// CHECK: ret <2 x i64> [[VLD1_LANE]]
poly64x2_t test_vld1q_lane_p64(poly64_t *a, poly64x2_t b) {
  return vld1q_lane_p64(a, b, 1);
}
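
// The 64-bit vld1_lane variants are checked the same way, with <8 x i8> as
// the intermediate bitcast type.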
// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK: ret <8 x i8> [[VLD1_LANE]]
uint8x8_t test_vld1_lane_u8(uint8_t *a, uint8x8_t b) {
  return vld1_lane_u8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK: ret <4 x i16> [[VLD1_LANE]]
uint16x4_t test_vld1_lane_u16(uint16_t *a, uint16x4_t b) {
  return vld1_lane_u16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK: ret <2 x i32> [[VLD1_LANE]]
uint32x2_t test_vld1_lane_u32(uint32_t *a, uint32x2_t b) {
  return vld1_lane_u32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK: ret <1 x i64> [[VLD1_LANE]]
uint64x1_t test_vld1_lane_u64(uint64_t *a, uint64x1_t b) {
  return vld1_lane_u64(a, b, 0);
}

// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK: ret <8 x i8> [[VLD1_LANE]]
int8x8_t test_vld1_lane_s8(int8_t *a, int8x8_t b) {
  return vld1_lane_s8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK: ret <4 x i16> [[VLD1_LANE]]
int16x4_t test_vld1_lane_s16(int16_t *a, int16x4_t b) {
  return vld1_lane_s16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK: ret <2 x i32> [[VLD1_LANE]]
int32x2_t test_vld1_lane_s32(int32_t *a, int32x2_t b) {
  return vld1_lane_s32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK: ret <1 x i64> [[VLD1_LANE]]
int64x1_t test_vld1_lane_s64(int64_t *a, int64x1_t b) {
  return vld1_lane_s64(a, b, 0);
}

// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
// CHECK: [[TMP4:%.*]] = load half, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
// CHECK: ret <4 x half> [[VLD1_LANE]]
float16x4_t test_vld1_lane_f16(float16_t *a, float16x4_t b) {
  return vld1_lane_f16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP4:%.*]] = load float, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
// CHECK: ret <2 x float> [[VLD1_LANE]]
float32x2_t test_vld1_lane_f32(float32_t *a, float32x2_t b) {
  return vld1_lane_f32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK: [[TMP4:%.*]] = load double, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
// CHECK: ret <1 x double> [[VLD1_LANE]]
float64x1_t test_vld1_lane_f64(float64_t *a, float64x1_t b) {
  return vld1_lane_f64(a, b, 0);
}

// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
// CHECK: [[TMP0:%.*]] = load i8, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK: ret <8 x i8> [[VLD1_LANE]]
poly8x8_t test_vld1_lane_p8(poly8_t *a, poly8x8_t b) {
  return vld1_lane_p8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK: ret <4 x i16> [[VLD1_LANE]]
poly16x4_t test_vld1_lane_p16(poly16_t *a, poly16x4_t b) {
  return vld1_lane_p16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK: ret <1 x i64> [[VLD1_LANE]]
poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) {
  return vld1_lane_p64(a, b, 0);
}
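
// The vldNq_lane intrinsics on vector tuples lower to the
// llvm.aarch64.neon.ldNlane intrinsics: the incoming tuple is spilled to a
// temporary, each member vector is reloaded and passed to the intrinsic
// along with the lane index and the pointer, and the aggregate result is
// copied back out through the return temporary.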
// CHECK-LABEL: define{{.*}} %struct.int8x16x2_t @test_vld2q_lane_s8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x2_t [[TMP8]]
int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
  return vld2q_lane_s8(ptr, src, 15);
}

// CHECK-LABEL: define{{.*}} %struct.uint8x16x2_t @test_vld2q_lane_u8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
  return vld2q_lane_u8(ptr, src, 15);
}

// CHECK-LABEL: define{{.*}} %struct.poly8x16x2_t @test_vld2q_lane_p8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
  return vld2q_lane_p8(ptr, src, 15);
}

// CHECK-LABEL: define{{.*}} %struct.int8x16x3_t @test_vld3q_lane_s8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x3_t [[TMP9]]
int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
  return vld3q_lane_s8(ptr, src, 15);
}

// CHECK-LABEL: define{{.*}} %struct.uint8x16x3_t @test_vld3q_lane_u8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[SRC]], i32 0, i32 0
// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, ptr [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x3_t [[TMP9]]
uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
  return vld3q_lane_u8(ptr, src, 15);
}
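
// For lane loads with element types wider than i8, the member vectors are
// additionally round-tripped through <16 x i8> bitcasts before and after the
// ld2lane call, as the CHECK lines below expect.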
872 // CHECK-LABEL: define{{.*}} %struct.uint16x8x2_t @test_vld2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
873 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
874 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
875 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
876 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
877 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
878 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
879 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
880 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
881 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
882 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
883 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
884 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
885 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
886 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
887 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
888 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
889 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
890 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
891 // CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
892 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
893 // CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
894 // CHECK: ret %struct.uint16x8x2_t [[TMP13]]
895 uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
896 return vld2q_lane_u16(a, b, 7);
899 // CHECK-LABEL: define{{.*}} %struct.uint32x4x2_t @test_vld2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
900 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
901 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
902 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
903 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
904 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
905 // CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
906 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
907 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
908 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
909 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
910 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
911 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
912 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
913 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
914 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
915 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
916 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
917 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
918 // CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
919 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
920 // CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
921 // CHECK: ret %struct.uint32x4x2_t [[TMP13]]
922 uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
923 return vld2q_lane_u32(a, b, 3);
926 // CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
927 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
928 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
929 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
930 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
931 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
932 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
933 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
934 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
935 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
936 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
937 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
938 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
939 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
940 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
941 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
942 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
943 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
944 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
945 // CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
946 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
947 // CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
948 // CHECK: ret %struct.uint64x2x2_t [[TMP13]]
949 uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
950 return vld2q_lane_u64(a, b, 1);
953 // CHECK-LABEL: define{{.*}} %struct.int16x8x2_t @test_vld2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
954 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
955 // CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
956 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
957 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
958 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
959 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
960 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
961 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
962 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
963 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
964 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
965 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
966 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
967 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
968 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
969 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
970 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
971 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
972 // CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
973 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
974 // CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
975 // CHECK: ret %struct.int16x8x2_t [[TMP13]]
976 int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) {
977 return vld2q_lane_s16(a, b, 7);
978 }
980 // CHECK-LABEL: define{{.*}} %struct.int32x4x2_t @test_vld2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
981 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
982 // CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
983 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
984 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
985 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
986 // CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
987 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
988 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
989 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
990 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
991 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
992 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
993 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
994 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
995 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
996 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
997 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
998 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
999 // CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
1000 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1001 // CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
1002 // CHECK: ret %struct.int32x4x2_t [[TMP13]]
1003 int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) {
1004 return vld2q_lane_s32(a, b, 3);
1005 }
1007 // CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1008 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
1009 // CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
1010 // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
1011 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
1012 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
1013 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1014 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1015 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
1016 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1017 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1018 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1019 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
1020 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1021 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1022 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1023 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1024 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1025 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
1026 // CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1027 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1028 // CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
1029 // CHECK: ret %struct.int64x2x2_t [[TMP13]]
1030 int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) {
1031 return vld2q_lane_s64(a, b, 1);
1032 }
1034 // CHECK-LABEL: define{{.*}} %struct.float16x8x2_t @test_vld2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
1035 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
1036 // CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
1037 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
1038 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
1039 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
1040 // CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1041 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1042 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
1043 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
1044 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
1045 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
1046 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
1047 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
1048 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
1049 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
1050 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
1051 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
1052 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, ptr %a)
1053 // CHECK: store { <8 x half>, <8 x half> } [[VLD2_LANE]], ptr [[__RET]]
1054 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1055 // CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, ptr [[RETVAL]], align 16
1056 // CHECK: ret %struct.float16x8x2_t [[TMP13]]
1057 float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) {
1058 return vld2q_lane_f16(a, b, 7);
1059 }
1061 // CHECK-LABEL: define{{.*}} %struct.float32x4x2_t @test_vld2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 {
1062 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
1063 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
1064 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
1065 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
1066 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
1067 // CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1068 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1069 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
1070 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
1071 // CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
1072 // CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
1073 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
1074 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
1075 // CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
1076 // CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
1077 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
1078 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
1079 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, ptr %a)
1080 // CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], ptr [[__RET]]
1081 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1082 // CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
1083 // CHECK: ret %struct.float32x4x2_t [[TMP13]]
1084 float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) {
1085 return vld2q_lane_f32(a, b, 3);
1086 }
1088 // CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 {
1089 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
1090 // CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
1091 // CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
1092 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
1093 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
1094 // CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1095 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1096 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
1097 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
1098 // CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
1099 // CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
1100 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
1101 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
1102 // CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
1103 // CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
1104 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
1105 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
1106 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, ptr %a)
1107 // CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], ptr [[__RET]]
1108 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1109 // CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
1110 // CHECK: ret %struct.float64x2x2_t [[TMP13]]
1111 float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) {
1112 return vld2q_lane_f64(a, b, 1);
1113 }
1115 // CHECK-LABEL: define{{.*}} %struct.poly16x8x2_t @test_vld2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1116 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
1117 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
1118 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
1119 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
1120 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
1121 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1122 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1123 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
1124 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1125 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1126 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1127 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
1128 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1129 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1130 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1131 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1132 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1133 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
1134 // CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1135 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1136 // CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
1137 // CHECK: ret %struct.poly16x8x2_t [[TMP13]]
1138 poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
1139 return vld2q_lane_p16(a, b, 7);
1140 }
1142 // CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1143 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
1144 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
1145 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
1146 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
1147 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
1148 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1149 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
1150 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
1151 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1152 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1153 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1154 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
1155 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1156 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1157 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1158 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1159 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1160 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
1161 // CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1162 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
1163 // CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
1164 // CHECK: ret %struct.poly64x2x2_t [[TMP13]]
1165 poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
1166 return vld2q_lane_p64(a, b, 1);
1167 }
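// The vld2_lane tests below exercise the 64-bit (D-register) variants: the aggregates are
// 8-byte aligned, the struct copies are 16 bytes, and the calls lower to the v8i8/v4i16/
// v2i32/v1i64 (and matching half/float/double) forms of llvm.aarch64.neon.ld2lane.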
1169 // CHECK-LABEL: define{{.*}} %struct.uint8x8x2_t @test_vld2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1170 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
1171 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
1172 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
1173 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
1174 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
1175 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1176 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1177 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
1178 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1179 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1180 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
1181 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1182 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1183 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
1184 // CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
1185 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1186 // CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
1187 // CHECK: ret %struct.uint8x8x2_t [[TMP8]]
1188 uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) {
1189 return vld2_lane_u8(a, b, 7);
1190 }
1192 // CHECK-LABEL: define{{.*}} %struct.uint16x4x2_t @test_vld2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1193 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
1194 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
1195 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
1196 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
1197 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
1198 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1199 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1200 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
1201 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1202 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1203 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1204 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
1205 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1206 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1207 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1208 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1209 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1210 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
1211 // CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1212 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1213 // CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
1214 // CHECK: ret %struct.uint16x4x2_t [[TMP13]]
1215 uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) {
1216 return vld2_lane_u16(a, b, 3);
1217 }
1219 // CHECK-LABEL: define{{.*}} %struct.uint32x2x2_t @test_vld2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
1220 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
1221 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
1222 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
1223 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
1224 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
1225 // CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1226 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1227 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
1228 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
1229 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
1230 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1231 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
1232 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
1233 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
1234 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1235 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1236 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1237 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
1238 // CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
1239 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1240 // CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
1241 // CHECK: ret %struct.uint32x2x2_t [[TMP13]]
1242 uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) {
1243 return vld2_lane_u32(a, b, 1);
1244 }
1246 // CHECK-LABEL: define{{.*}} %struct.uint64x1x2_t @test_vld2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
1247 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
1248 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
1249 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
1250 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
1251 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
1252 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1253 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1254 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
1255 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
1256 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
1257 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1258 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
1259 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
1260 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
1261 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1262 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1263 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1264 // CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
1265 // CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1266 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1267 // CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, ptr [[RETVAL]], align 8
1268 // CHECK: ret %struct.uint64x1x2_t [[TMP13]]
1269 uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
1270 return vld2_lane_u64(a, b, 0);
1271 }
1273 // CHECK-LABEL: define{{.*}} %struct.int8x8x2_t @test_vld2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1274 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
1275 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
1276 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
1277 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
1278 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
1279 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1280 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1281 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
1282 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1283 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1284 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
1285 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1286 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1287 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
1288 // CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
1289 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1290 // CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
1291 // CHECK: ret %struct.int8x8x2_t [[TMP8]]
1292 int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
1293 return vld2_lane_s8(a, b, 7);
1294 }
1296 // CHECK-LABEL: define{{.*}} %struct.int16x4x2_t @test_vld2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1297 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
1298 // CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
1299 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
1300 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
1301 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
1302 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1303 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1304 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
1305 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1306 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1307 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1308 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
1309 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1310 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1311 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1312 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1313 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1314 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
1315 // CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1316 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1317 // CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
1318 // CHECK: ret %struct.int16x4x2_t [[TMP13]]
1319 int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) {
1320 return vld2_lane_s16(a, b, 3);
1321 }
1323 // CHECK-LABEL: define{{.*}} %struct.int32x2x2_t @test_vld2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
1324 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
1325 // CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
1326 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
1327 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
1328 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
1329 // CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1330 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1331 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
1332 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
1333 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
1334 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1335 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
1336 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
1337 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
1338 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1339 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1340 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1341 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
1342 // CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
1343 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1344 // CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
1345 // CHECK: ret %struct.int32x2x2_t [[TMP13]]
1346 int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) {
1347 return vld2_lane_s32(a, b, 1);
1348 }
1350 // CHECK-LABEL: define{{.*}} %struct.int64x1x2_t @test_vld2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
1351 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
1352 // CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
1353 // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
1354 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
1355 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
1356 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1357 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1358 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
1359 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
1360 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
1361 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1362 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
1363 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
1364 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
1365 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1366 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1367 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1368 // CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
1369 // CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1370 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1371 // CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, ptr [[RETVAL]], align 8
1372 // CHECK: ret %struct.int64x1x2_t [[TMP13]]
1373 int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) {
1374 return vld2_lane_s64(a, b, 0);
1375 }
1377 // CHECK-LABEL: define{{.*}} %struct.float16x4x2_t @test_vld2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 {
1378 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
1379 // CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
1380 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
1381 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
1382 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
1383 // CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1384 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1385 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
1386 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
1387 // CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
1388 // CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
1389 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
1390 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
1391 // CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
1392 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
1393 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
1394 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
1395 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, ptr %a)
1396 // CHECK: store { <4 x half>, <4 x half> } [[VLD2_LANE]], ptr [[__RET]]
1397 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1398 // CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, ptr [[RETVAL]], align 8
1399 // CHECK: ret %struct.float16x4x2_t [[TMP13]]
1400 float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) {
1401 return vld2_lane_f16(a, b, 3);
1402 }
1404 // CHECK-LABEL: define{{.*}} %struct.float32x2x2_t @test_vld2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 {
1405 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
1406 // CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
1407 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
1408 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
1409 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
1410 // CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1411 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1412 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
1413 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
1414 // CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
1415 // CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
1416 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
1417 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
1418 // CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
1419 // CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
1420 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
1421 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
1422 // CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, ptr %a)
1423 // CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], ptr [[__RET]]
1424 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1425 // CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
1426 // CHECK: ret %struct.float32x2x2_t [[TMP13]]
1427 float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
1428 return vld2_lane_f32(a, b, 1);
1429 }
1431 // CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 {
1432 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
1433 // CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
1434 // CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
1435 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
1436 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
1437 // CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1438 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1439 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
1440 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
1441 // CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
1442 // CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
1443 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
1444 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
1445 // CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
1446 // CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
1447 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
1448 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
1449 // CHECK: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, ptr %a)
1450 // CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], ptr [[__RET]]
1451 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1452 // CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
1453 // CHECK: ret %struct.float64x1x2_t [[TMP13]]
1454 float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) {
1455 return vld2_lane_f64(a, b, 0);
1456 }
1458 // CHECK-LABEL: define{{.*}} %struct.poly8x8x2_t @test_vld2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1459 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
1460 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
1461 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
1462 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
1463 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
1464 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1465 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1466 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
1467 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1468 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1469 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
1470 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1471 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1472 // CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
1473 // CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
1474 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1475 // CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
1476 // CHECK: ret %struct.poly8x8x2_t [[TMP8]]
1477 poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) {
1478 return vld2_lane_p8(a, b, 7);
1479 }
1481 // CHECK-LABEL: define{{.*}} %struct.poly16x4x2_t @test_vld2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1482 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
1483 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
1484 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
1485 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
1486 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
1487 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1488 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1489 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
1490 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1491 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1492 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1493 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
1494 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1495 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1496 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1497 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1498 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1499 // CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
1500 // CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
1501 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1502 // CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
1503 // CHECK: ret %struct.poly16x4x2_t [[TMP13]]
1504 poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) {
1505 return vld2_lane_p16(a, b, 3);
1506 }
1508 // CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
1509 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
1510 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
1511 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
1512 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
1513 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
1514 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1515 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
1516 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
1517 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
1518 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
1519 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1520 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
1521 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
1522 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
1523 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1524 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1525 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1526 // CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
1527 // CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
1528 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
1529 // CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
1530 // CHECK: ret %struct.poly64x1x2_t [[TMP13]]
1531 poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) {
1532 return vld2_lane_p64(a, b, 0);
1533 }
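// From here on the tests switch to vld3q_lane: three-element aggregates (48-byte struct
// copies) lowered to llvm.aarch64.neon.ld3lane, which takes three vector operands followed
// by the lane index and the source pointer.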
1535 // CHECK-LABEL: define{{.*}} %struct.uint16x8x3_t @test_vld3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1536 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
1537 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
1538 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
1539 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
1540 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
1541 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1542 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1543 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
1544 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1545 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1546 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1547 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
1548 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1549 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1550 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1551 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
1552 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
1553 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
1554 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1555 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1556 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1557 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1558 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
1559 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1560 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1561 // CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, ptr [[RETVAL]], align 16
1562 // CHECK: ret %struct.uint16x8x3_t [[TMP16]]
1563 uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
1564 return vld3q_lane_u16(a, b, 7);
1565 }
1567 // CHECK-LABEL: define{{.*}} %struct.uint32x4x3_t @test_vld3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
1568 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
1569 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
1570 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
1571 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
1572 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
1573 // CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1574 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1575 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
1576 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
1577 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
1578 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
1579 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
1580 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
1581 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
1582 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
1583 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
1584 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
1585 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
1586 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
1587 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1588 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1589 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
1590 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
1591 // CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
1592 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1593 // CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, ptr [[RETVAL]], align 16
1594 // CHECK: ret %struct.uint32x4x3_t [[TMP16]]
1595 uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
1596 return vld3q_lane_u32(a, b, 3);
1597 }
1599 // CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1600 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
1601 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
1602 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
1603 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
1604 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
1605 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1606 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1607 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
1608 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1609 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1610 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1611 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
1612 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1613 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1614 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1615 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
1616 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
1617 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
1618 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
1619 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1620 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1621 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
1622 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
1623 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
1624 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1625 // CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
1626 // CHECK: ret %struct.uint64x2x3_t [[TMP16]]
1627 uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
1628 return vld3q_lane_u64(a, b, 1);
1629 }
1631 // CHECK-LABEL: define{{.*}} %struct.int16x8x3_t @test_vld3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1632 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
1633 // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
1634 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
1635 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
1636 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
1637 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1638 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1639 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
1640 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1641 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1642 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1643 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
1644 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1645 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1646 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1647 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
1648 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
1649 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
1650 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1651 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1652 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1653 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1654 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
1655 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1656 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1657 // CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, ptr [[RETVAL]], align 16
1658 // CHECK: ret %struct.int16x8x3_t [[TMP16]]
1659 int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) {
1660 return vld3q_lane_s16(a, b, 7);
1661 }
1663 // CHECK-LABEL: define{{.*}} %struct.int32x4x3_t @test_vld3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
1664 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
1665 // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
1666 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
1667 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
1668 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
1669 // CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1670 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1671 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
1672 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
1673 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
1674 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
1675 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
1676 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
1677 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
1678 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
1679 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
1680 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
1681 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
1682 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
1683 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1684 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1685 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
1686 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
1687 // CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
1688 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1689 // CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, ptr [[RETVAL]], align 16
1690 // CHECK: ret %struct.int32x4x3_t [[TMP16]]
1691 int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) {
1692 return vld3q_lane_s32(a, b, 3);
1693 }
1695 // CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1696 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
1697 // CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
1698 // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
1699 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
1700 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
1701 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1702 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1703 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
1704 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1705 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1706 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1707 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
1708 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1709 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1710 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1711 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
1712 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
1713 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
1714 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
1715 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1716 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1717 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
1718 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
1719 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
1720 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1721 // CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
1722 // CHECK: ret %struct.int64x2x3_t [[TMP16]]
1723 int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) {
1724 return vld3q_lane_s64(a, b, 1);
1725 }
1727 // CHECK-LABEL: define{{.*}} %struct.float16x8x3_t @test_vld3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
1728 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
1729 // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
1730 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
1731 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
1732 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
1733 // CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1734 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1735 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
1736 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
1737 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
1738 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
1739 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
1740 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
1741 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
1742 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
1743 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
1744 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
1745 // CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
1746 // CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
1747 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
1748 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
1749 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
1750 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, ptr %a)
1751 // CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], ptr [[__RET]]
1752 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1753 // CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, ptr [[RETVAL]], align 16
1754 // CHECK: ret %struct.float16x8x3_t [[TMP16]]
1755 float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) {
1756 return vld3q_lane_f16(a, b, 7);
1757 }
1759 // CHECK-LABEL: define{{.*}} %struct.float32x4x3_t @test_vld3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
1760 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
1761 // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
1762 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
1763 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
1764 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
1765 // CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1766 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1767 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
1768 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
1769 // CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
1770 // CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
1771 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
1772 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
1773 // CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
1774 // CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
1775 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
1776 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
1777 // CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
1778 // CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
1779 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
1780 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
1781 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
1782 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, ptr %a)
1783 // CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], ptr [[__RET]]
1784 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1785 // CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, ptr [[RETVAL]], align 16
1786 // CHECK: ret %struct.float32x4x3_t [[TMP16]]
1787 float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) {
1788 return vld3q_lane_f32(a, b, 3);
1789 }
1791 // CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
1792 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
1793 // CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
1794 // CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
1795 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
1796 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
1797 // CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1798 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1799 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
1800 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
1801 // CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
1802 // CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
1803 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
1804 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
1805 // CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
1806 // CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
1807 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
1808 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
1809 // CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
1810 // CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
1811 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
1812 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
1813 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
1814 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, ptr %a)
1815 // CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], ptr [[__RET]]
1816 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1817 // CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
1818 // CHECK: ret %struct.float64x2x3_t [[TMP16]]
1819 float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
1820 return vld3q_lane_f64(a, b, 1);
1821 }
1823 // CHECK-LABEL: define{{.*}} %struct.poly8x16x3_t @test_vld3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
1824 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
1825 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
1826 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
1827 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
1828 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
1829 // CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1830 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1831 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
1832 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
1833 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
1834 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
1835 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
1836 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
1837 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
1838 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
1839 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
1840 // CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
1841 // CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
1842 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1843 // CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, ptr [[RETVAL]], align 16
1844 // CHECK: ret %struct.poly8x16x3_t [[TMP9]]
1845 poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
1846 return vld3q_lane_p8(a, b, 15);
1847 }
1849 // CHECK-LABEL: define{{.*}} %struct.poly16x8x3_t @test_vld3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
1850 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
1851 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
1852 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
1853 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
1854 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
1855 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1856 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1857 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
1858 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
1859 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
1860 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1861 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
1862 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
1863 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
1864 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1865 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
1866 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
1867 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
1868 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1869 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1870 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1871 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1872 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
1873 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1874 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1875 // CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, ptr [[RETVAL]], align 16
1876 // CHECK: ret %struct.poly16x8x3_t [[TMP16]]
1877 poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
1878 return vld3q_lane_p16(a, b, 7);
1879 }
1881 // CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
1882 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
1883 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
1884 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
1885 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
1886 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
1887 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
1888 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
1889 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
1890 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
1891 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
1892 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1893 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
1894 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
1895 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
1896 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1897 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
1898 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
1899 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
1900 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
1901 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1902 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1903 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
1904 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
1905 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
1906 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
1907 // CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
1908 // CHECK: ret %struct.poly64x2x3_t [[TMP16]]
1909 poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
1910 return vld3q_lane_p64(a, b, 1);
1911 }
1913 // CHECK-LABEL: define{{.*}} %struct.uint8x8x3_t @test_vld3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
1914 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
1915 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
1916 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
1917 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
1918 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
1919 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1920 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
1921 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
1922 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
1923 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
1924 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
1925 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
1926 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
1927 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
1928 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
1929 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
1930 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
1931 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
1932 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
1933 // CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, ptr [[RETVAL]], align 8
1934 // CHECK: ret %struct.uint8x8x3_t [[TMP9]]
1935 uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
1936 return vld3_lane_u8(a, b, 7);
1937 }
1939 // CHECK-LABEL: define{{.*}} %struct.uint16x4x3_t @test_vld3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
1940 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
1941 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
1942 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
1943 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
1944 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
1945 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1946 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
1947 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
1948 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
1949 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
1950 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1951 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
1952 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
1953 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
1954 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1955 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
1956 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
1957 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
1958 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
1959 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1960 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1961 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
1962 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
1963 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
1964 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
1965 // CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, ptr [[RETVAL]], align 8
1966 // CHECK: ret %struct.uint16x4x3_t [[TMP16]]
1967 uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
1968 return vld3_lane_u16(a, b, 3);
1969 }
1971 // CHECK-LABEL: define{{.*}} %struct.uint32x2x3_t @test_vld3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
1972 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
1973 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
1974 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
1975 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
1976 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
1977 // CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
1978 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
1979 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
1980 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
1981 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
1982 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1983 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
1984 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
1985 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
1986 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1987 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
1988 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
1989 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
1990 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
1991 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1992 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1993 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
1994 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
1995 // CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
1996 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
1997 // CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, ptr [[RETVAL]], align 8
1998 // CHECK: ret %struct.uint32x2x3_t [[TMP16]]
1999 uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
2000 return vld3_lane_u32(a, b, 1);
2001 }
2003 // CHECK-LABEL: define{{.*}} %struct.uint64x1x3_t @test_vld3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2004 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
2005 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
2006 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
2007 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
2008 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
2009 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2010 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2011 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
2012 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2013 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2014 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2015 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
2016 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2017 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2018 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2019 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
2020 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2021 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2022 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2023 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2024 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2025 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2026 // CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
2027 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
2028 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2029 // CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, ptr [[RETVAL]], align 8
2030 // CHECK: ret %struct.uint64x1x3_t [[TMP16]]
2031 uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) {
2032 return vld3_lane_u64(a, b, 0);
2033 }
2035 // CHECK-LABEL: define{{.*}} %struct.int8x8x3_t @test_vld3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2036 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
2037 // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
2038 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
2039 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
2040 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
2041 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2042 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2043 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
2044 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2045 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2046 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
2047 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2048 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2049 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
2050 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2051 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2052 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
2053 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
2054 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2055 // CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, ptr [[RETVAL]], align 8
2056 // CHECK: ret %struct.int8x8x3_t [[TMP9]]
2057 int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) {
2058 return vld3_lane_s8(a, b, 7);
2059 }
2061 // CHECK-LABEL: define{{.*}} %struct.int16x4x3_t @test_vld3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
2062 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
2063 // CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
2064 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
2065 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
2066 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
2067 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2068 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2069 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
2070 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
2071 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
2072 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2073 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
2074 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
2075 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
2076 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2077 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
2078 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
2079 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
2080 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2081 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2082 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2083 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2084 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
2085 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
2086 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2087 // CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, ptr [[RETVAL]], align 8
2088 // CHECK: ret %struct.int16x4x3_t [[TMP16]]
2089 int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) {
2090 return vld3_lane_s16(a, b, 3);
2091 }
2093 // CHECK-LABEL: define{{.*}} %struct.int32x2x3_t @test_vld3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
2094 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
2095 // CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
2096 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
2097 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
2098 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
2099 // CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2100 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2101 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
2102 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
2103 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
2104 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
2105 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
2106 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
2107 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
2108 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
2109 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
2110 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
2111 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
2112 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
2113 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
2114 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
2115 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
2116 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
2117 // CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
2118 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2119 // CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, ptr [[RETVAL]], align 8
2120 // CHECK: ret %struct.int32x2x3_t [[TMP16]]
2121 int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) {
2122 return vld3_lane_s32(a, b, 1);
2123 }
2125 // CHECK-LABEL: define{{.*}} %struct.int64x1x3_t @test_vld3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2126 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
2127 // CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
2128 // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
2129 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
2130 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
2131 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2132 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2133 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
2134 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2135 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2136 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2137 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
2138 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2139 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2140 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2141 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
2142 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2143 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2144 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2145 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2146 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2147 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2148 // CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
2149 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
2150 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2151 // CHECK: [[TMP16:%.*]] = load %struct.int64x1x3_t, ptr [[RETVAL]], align 8
2152 // CHECK: ret %struct.int64x1x3_t [[TMP16]]
2153 int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) {
2154 return vld3_lane_s64(a, b, 0);
2155 }
2157 // CHECK-LABEL: define{{.*}} %struct.float16x4x3_t @test_vld3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
2158 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
2159 // CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
2160 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
2161 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
2162 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
2163 // CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2164 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2165 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
2166 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
2167 // CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
2168 // CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
2169 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
2170 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
2171 // CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
2172 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
2173 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
2174 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
2175 // CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
2176 // CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
2177 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
2178 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
2179 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
2180 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, ptr %a)
2181 // CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], ptr [[__RET]]
2182 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2183 // CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, ptr [[RETVAL]], align 8
2184 // CHECK: ret %struct.float16x4x3_t [[TMP16]]
2185 float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) {
2186 return vld3_lane_f16(a, b, 3);
2187 }
2189 // CHECK-LABEL: define{{.*}} %struct.float32x2x3_t @test_vld3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
2190 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
2191 // CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
2192 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
2193 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
2194 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
2195 // CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2196 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2197 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
2198 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
2199 // CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
2200 // CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
2201 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
2202 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
2203 // CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
2204 // CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
2205 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
2206 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
2207 // CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
2208 // CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
2209 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
2210 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
2211 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
2212 // CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, ptr %a)
2213 // CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], ptr [[__RET]]
2214 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2215 // CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, ptr [[RETVAL]], align 8
2216 // CHECK: ret %struct.float32x2x3_t [[TMP16]]
2217 float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) {
2218 return vld3_lane_f32(a, b, 1);
2219 }
2221 // CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
2222 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
2223 // CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
2224 // CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
2225 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
2226 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
2227 // CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2228 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2229 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
2230 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
2231 // CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
2232 // CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
2233 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
2234 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
2235 // CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
2236 // CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
2237 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
2238 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
2239 // CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
2240 // CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
2241 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
2242 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
2243 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
2244 // CHECK: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, ptr %a)
2245 // CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], ptr [[__RET]]
2246 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2247 // CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
2248 // CHECK: ret %struct.float64x1x3_t [[TMP16]]
2249 float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) {
2250 return vld3_lane_f64(a, b, 0);
2251 }
2253 // CHECK-LABEL: define{{.*}} %struct.poly8x8x3_t @test_vld3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2254 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
2255 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
2256 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
2257 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
2258 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
2259 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2260 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2261 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
2262 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2263 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2264 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
2265 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2266 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2267 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
2268 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2269 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2270 // CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
2271 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
2272 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2273 // CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, ptr [[RETVAL]], align 8
2274 // CHECK: ret %struct.poly8x8x3_t [[TMP9]]
2275 poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) {
2276 return vld3_lane_p8(a, b, 7);
2277 }
2279 // CHECK-LABEL: define{{.*}} %struct.poly16x4x3_t @test_vld3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
2280 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
2281 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
2282 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
2283 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
2284 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
2285 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2286 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2287 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
2288 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
2289 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
2290 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2291 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
2292 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
2293 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
2294 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2295 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
2296 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
2297 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
2298 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2299 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2300 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2301 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2302 // CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
2303 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
2304 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2305 // CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, ptr [[RETVAL]], align 8
2306 // CHECK: ret %struct.poly16x4x3_t [[TMP16]]
2307 poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) {
2308 return vld3_lane_p16(a, b, 3);
2309 }
2311 // CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2312 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
2313 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
2314 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
2315 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
2316 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
2317 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2318 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
2319 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
2320 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2321 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2322 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2323 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
2324 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2325 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2326 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2327 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
2328 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2329 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2330 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2331 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2332 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2333 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2334 // CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
2335 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
2336 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
2337 // CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
2338 // CHECK: ret %struct.poly64x1x3_t [[TMP16]]
2339 poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) {
2340 return vld3_lane_p64(a, b, 0);
2341 }
2343 // CHECK-LABEL: define{{.*}} %struct.uint8x16x4_t @test_vld4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
2344 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
2345 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
2346 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
2347 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
2348 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
2349 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2350 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2351 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2352 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
2353 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
2354 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2355 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
2356 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
2357 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2358 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
2359 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
2360 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
2361 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
2362 // CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
2363 // CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
2364 // CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2365 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2366 // CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, ptr [[RETVAL]], align 16
2367 // CHECK: ret %struct.uint8x16x4_t [[TMP10]]
2368 uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
2369 return vld4q_lane_u8(a, b, 15);
2370 }
2372 // CHECK-LABEL: define{{.*}} %struct.uint16x8x4_t @test_vld4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
2373 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
2374 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
2375 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
2376 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
2377 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
2378 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2379 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2380 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2381 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
2382 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
2383 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2384 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2385 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
2386 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
2387 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2388 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2389 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
2390 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
2391 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2392 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
2393 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
2394 // CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
2395 // CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2396 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2397 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2398 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2399 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2400 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
2401 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2402 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2403 // CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, ptr [[RETVAL]], align 16
2404 // CHECK: ret %struct.uint16x8x4_t [[TMP19]]
2405 uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
2406 return vld4q_lane_u16(a, b, 7);
2407 }
2409 // CHECK-LABEL: define{{.*}} %struct.uint32x4x4_t @test_vld4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
2410 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
2411 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
2412 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
2413 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
2414 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
2415 // CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2416 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2417 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2418 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
2419 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
2420 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
2421 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2422 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
2423 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
2424 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
2425 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2426 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
2427 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
2428 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
2429 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
2430 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
2431 // CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
2432 // CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2433 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2434 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2435 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
2436 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
2437 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
2438 // CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
2439 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2440 // CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, ptr [[RETVAL]], align 16
2441 // CHECK: ret %struct.uint32x4x4_t [[TMP19]]
2442 uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
2443 return vld4q_lane_u32(a, b, 3);
2444 }
2446 // CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
2447 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
2448 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
2449 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
2450 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
2451 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
2452 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2453 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2454 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2455 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
2456 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
2457 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2458 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2459 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
2460 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
2461 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2462 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2463 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
2464 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
2465 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2466 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
2467 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
2468 // CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
2469 // CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
2470 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2471 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2472 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2473 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2474 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
2475 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2476 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2477 // CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
2478 // CHECK: ret %struct.uint64x2x4_t [[TMP19]]
2479 uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
2480 return vld4q_lane_u64(a, b, 1);
2481 }
2483 // CHECK-LABEL: define{{.*}} %struct.int8x16x4_t @test_vld4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
2484 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
2485 // CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
2486 // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
2487 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
2488 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
2489 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2490 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2491 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2492 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
2493 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
2494 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2495 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
2496 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
2497 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2498 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
2499 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
2500 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
2501 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
2502 // CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
2503 // CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
2504 // CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2505 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2506 // CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, ptr [[RETVAL]], align 16
2507 // CHECK: ret %struct.int8x16x4_t [[TMP10]]
2508 int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) {
2509 return vld4q_lane_s8(a, b, 15);
2510 }
2512 // CHECK-LABEL: define{{.*}} %struct.int16x8x4_t @test_vld4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
2513 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
2514 // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
2515 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
2516 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
2517 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
2518 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2519 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2520 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2521 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
2522 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
2523 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2524 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2525 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
2526 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
2527 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2528 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2529 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
2530 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
2531 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2532 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
2533 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
2534 // CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
2535 // CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2536 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2537 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2538 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2539 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2540 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
2541 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2542 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2543 // CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, ptr [[RETVAL]], align 16
2544 // CHECK: ret %struct.int16x8x4_t [[TMP19]]
2545 int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) {
2546 return vld4q_lane_s16(a, b, 7);
2547 }
2549 // CHECK-LABEL: define{{.*}} %struct.int32x4x4_t @test_vld4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
2550 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
2551 // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
2552 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
2553 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
2554 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
2555 // CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2556 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2557 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2558 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
2559 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
2560 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
2561 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2562 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
2563 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
2564 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
2565 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2566 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
2567 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
2568 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
2569 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
2570 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
2571 // CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
2572 // CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
2573 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2574 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2575 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
2576 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
2577 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
2578 // CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
2579 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2580 // CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, ptr [[RETVAL]], align 16
2581 // CHECK: ret %struct.int32x4x4_t [[TMP19]]
2582 int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) {
2583 return vld4q_lane_s32(a, b, 3);
2584 }
2586 // CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
2587 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
2588 // CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
2589 // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
2590 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
2591 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
2592 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2593 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2594 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2595 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
2596 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
2597 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2598 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2599 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
2600 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
2601 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2602 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2603 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
2604 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
2605 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2606 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
2607 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
2608 // CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
2609 // CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
2610 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2611 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2612 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2613 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2614 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
2615 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2616 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2617 // CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
2618 // CHECK: ret %struct.int64x2x4_t [[TMP19]]
2619 int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) {
2620 return vld4q_lane_s64(a, b, 1);
2621 }
2623 // CHECK-LABEL: define{{.*}} %struct.float16x8x4_t @test_vld4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 {
2624 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
2625 // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
2626 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
2627 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
2628 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
2629 // CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2630 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2631 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2632 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
2633 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
2634 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
2635 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2636 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
2637 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
2638 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
2639 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2640 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
2641 // CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
2642 // CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
2643 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
2644 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
2645 // CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
2646 // CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
2647 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
2648 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
2649 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
2650 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
2651 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, ptr %a)
2652 // CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], ptr [[__RET]]
2653 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2654 // CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, ptr [[RETVAL]], align 16
2655 // CHECK: ret %struct.float16x8x4_t [[TMP19]]
2656 float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) {
2657 return vld4q_lane_f16(a, b, 7);
2658 }
2660 // CHECK-LABEL: define{{.*}} %struct.float32x4x4_t @test_vld4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 {
2661 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
2662 // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
2663 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
2664 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
2665 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
2666 // CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2667 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2668 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2669 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
2670 // CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
2671 // CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
2672 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2673 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
2674 // CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
2675 // CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
2676 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2677 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
2678 // CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
2679 // CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
2680 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
2681 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
2682 // CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
2683 // CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
2684 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
2685 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
2686 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
2687 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
2688 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, ptr %a)
2689 // CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], ptr [[__RET]]
2690 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2691 // CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, ptr [[RETVAL]], align 16
2692 // CHECK: ret %struct.float32x4x4_t [[TMP19]]
2693 float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) {
2694 return vld4q_lane_f32(a, b, 3);
2695 }
2697 // CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
2698 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
2699 // CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
2700 // CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
2701 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
2702 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
2703 // CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2704 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2705 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2706 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
2707 // CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
2708 // CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
2709 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2710 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
2711 // CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
2712 // CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
2713 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2714 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
2715 // CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
2716 // CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
2717 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
2718 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
2719 // CHECK: [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
2720 // CHECK: [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
2721 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
2722 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
2723 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
2724 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
2725 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, ptr %a)
2726 // CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], ptr [[__RET]]
2727 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2728 // CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
2729 // CHECK: ret %struct.float64x2x4_t [[TMP19]]
2730 float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) {
2731 return vld4q_lane_f64(a, b, 1);
2732 }
2734 // CHECK-LABEL: define{{.*}} %struct.poly8x16x4_t @test_vld4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
2735 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
2736 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
2737 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
2738 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
2739 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
2740 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2741 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2742 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2743 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
2744 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
2745 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2746 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
2747 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
2748 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2749 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
2750 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
2751 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
2752 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
2753 // CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
2754 // CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
2755 // CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2756 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2757 // CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, ptr [[RETVAL]], align 16
2758 // CHECK: ret %struct.poly8x16x4_t [[TMP10]]
2759 poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
2760 return vld4q_lane_p8(a, b, 15);
2761 }
2763 // CHECK-LABEL: define{{.*}} %struct.poly16x8x4_t @test_vld4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
2764 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
2765 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
2766 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
2767 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
2768 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
2769 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2770 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2771 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2772 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
2773 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
2774 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2775 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2776 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
2777 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
2778 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2779 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2780 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
2781 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
2782 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2783 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
2784 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
2785 // CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
2786 // CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2787 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2788 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2789 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2790 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2791 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
2792 // CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2793 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2794 // CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, ptr [[RETVAL]], align 16
2795 // CHECK: ret %struct.poly16x8x4_t [[TMP19]]
2796 poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
2797 return vld4q_lane_p16(a, b, 7);
2798 }
2800 // CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
2801 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
2802 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
2803 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
2804 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
2805 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
2806 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
2807 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
2808 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2809 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
2810 // CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
2811 // CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2812 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2813 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
2814 // CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
2815 // CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2816 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2817 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
2818 // CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
2819 // CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2820 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
2821 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
2822 // CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
2823 // CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
2824 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2825 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2826 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2827 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
2828 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
2829 // CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2830 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
2831 // CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
2832 // CHECK: ret %struct.poly64x2x4_t [[TMP19]]
2833 poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
2834 return vld4q_lane_p64(a, b, 1);
2835 }
2837 // CHECK-LABEL: define{{.*}} %struct.uint8x8x4_t @test_vld4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2838 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
2839 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
2840 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
2841 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
2842 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
2843 // CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2844 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2845 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2846 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2847 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2848 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2849 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2850 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2851 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2852 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2853 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2854 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
2855 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
2856 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
2857 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
2858 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2859 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2860 // CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, ptr [[RETVAL]], align 8
2861 // CHECK: ret %struct.uint8x8x4_t [[TMP10]]
2862 uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) {
2863 return vld4_lane_u8(a, b, 7);
2864 }
2866 // CHECK-LABEL: define{{.*}} %struct.uint16x4x4_t @test_vld4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
2867 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
2868 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
2869 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
2870 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
2871 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
2872 // CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2873 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2874 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2875 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
2876 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
2877 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2878 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2879 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
2880 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
2881 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2882 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2883 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
2884 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
2885 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2886 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
2887 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
2888 // CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
2889 // CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
2890 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2891 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2892 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2893 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
2894 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
2895 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
2896 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2897 // CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, ptr [[RETVAL]], align 8
2898 // CHECK: ret %struct.uint16x4x4_t [[TMP19]]
2899 uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) {
2900 return vld4_lane_u16(a, b, 3);
2901 }
2903 // CHECK-LABEL: define{{.*}} %struct.uint32x2x4_t @test_vld4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
2904 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
2905 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
2906 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
2907 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
2908 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
2909 // CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2910 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2911 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2912 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
2913 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
2914 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
2915 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2916 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
2917 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
2918 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
2919 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2920 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
2921 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
2922 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
2923 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
2924 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
2925 // CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
2926 // CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
2927 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
2928 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
2929 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
2930 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
2931 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
2932 // CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
2933 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2934 // CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, ptr [[RETVAL]], align 8
2935 // CHECK: ret %struct.uint32x2x4_t [[TMP19]]
2936 uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) {
2937 return vld4_lane_u32(a, b, 1);
2938 }
2940 // CHECK-LABEL: define{{.*}} %struct.uint64x1x4_t @test_vld4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
2941 // CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
2942 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
2943 // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
2944 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
2945 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
2946 // CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2947 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2948 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2949 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
2950 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
2951 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2952 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2953 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
2954 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
2955 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2956 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2957 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
2958 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
2959 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2960 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
2961 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
2962 // CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
2963 // CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
2964 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2965 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2966 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2967 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
2968 // CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
2969 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
2970 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
2971 // CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, ptr [[RETVAL]], align 8
2972 // CHECK: ret %struct.uint64x1x4_t [[TMP19]]
2973 uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) {
2974 return vld4_lane_u64(a, b, 0);
2975 }
2977 // CHECK-LABEL: define{{.*}} %struct.int8x8x4_t @test_vld4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
2978 // CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
2979 // CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
2980 // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
2981 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
2982 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
2983 // CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
2984 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
2985 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2986 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
2987 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
2988 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2989 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
2990 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
2991 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2992 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
2993 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
2994 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
2995 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
2996 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
2997 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
2998 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
2999 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3000 // CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, ptr [[RETVAL]], align 8
3001 // CHECK: ret %struct.int8x8x4_t [[TMP10]]
3002 int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) {
3003 return vld4_lane_s8(a, b, 7);
3004 }
3006 // CHECK-LABEL: define{{.*}} %struct.int16x4x4_t @test_vld4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3007 // CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
3008 // CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
3009 // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
3010 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
3011 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
3012 // CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3013 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3014 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3015 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
3016 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
3017 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
3018 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3019 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
3020 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
3021 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
3022 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3023 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
3024 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
3025 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
3026 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
3027 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
3028 // CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
3029 // CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
3030 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
3031 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
3032 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
3033 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
3034 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
3035 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
3036 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3037 // CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, ptr [[RETVAL]], align 8
3038 // CHECK: ret %struct.int16x4x4_t [[TMP19]]
3039 int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) {
3040 return vld4_lane_s16(a, b, 3);
3041 }
3043 // CHECK-LABEL: define{{.*}} %struct.int32x2x4_t @test_vld4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
3044 // CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
3045 // CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
3046 // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
3047 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
3048 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
3049 // CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3050 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3051 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3052 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
3053 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
3054 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
3055 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3056 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
3057 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
3058 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
3059 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3060 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
3061 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
3062 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
3063 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
3064 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
3065 // CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
3066 // CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
3067 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
3068 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
3069 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
3070 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
3071 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
3072 // CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
3073 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3074 // CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, ptr [[RETVAL]], align 8
3075 // CHECK: ret %struct.int32x2x4_t [[TMP19]]
3076 int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) {
3077 return vld4_lane_s32(a, b, 1);
3078 }
3080 // CHECK-LABEL: define{{.*}} %struct.int64x1x4_t @test_vld4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
3081 // CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
3082 // CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
3083 // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
3084 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
3085 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
3086 // CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3087 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3088 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3089 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
3090 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
3091 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
3092 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3093 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
3094 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
3095 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
3096 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3097 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
3098 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
3099 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
3100 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
3101 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
3102 // CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
3103 // CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
3104 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
3105 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
3106 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
3107 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
3108 // CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
3109 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
3110 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3111 // CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, ptr [[RETVAL]], align 8
3112 // CHECK: ret %struct.int64x1x4_t [[TMP19]]
3113 int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) {
3114 return vld4_lane_s64(a, b, 0);
3115 }
3117 // CHECK-LABEL: define{{.*}} %struct.float16x4x4_t @test_vld4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
3118 // CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
3119 // CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
3120 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
3121 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
3122 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
3123 // CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3124 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3125 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3126 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
3127 // CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
3128 // CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
3129 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3130 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
3131 // CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
3132 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
3133 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3134 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
3135 // CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
3136 // CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
3137 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
3138 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
3139 // CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
3140 // CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
3141 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
3142 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
3143 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
3144 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
3145 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, ptr %a)
3146 // CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], ptr [[__RET]]
3147 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3148 // CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, ptr [[RETVAL]], align 8
3149 // CHECK: ret %struct.float16x4x4_t [[TMP19]]
3150 float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
3151 return vld4_lane_f16(a, b, 3);
3152 }
3154 // CHECK-LABEL: define{{.*}} %struct.float32x2x4_t @test_vld4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
3155 // CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
3156 // CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
3157 // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
3158 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
3159 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
3160 // CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3161 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3162 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3163 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
3164 // CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
3165 // CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
3166 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3167 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
3168 // CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
3169 // CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
3170 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3171 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
3172 // CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
3173 // CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
3174 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
3175 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
3176 // CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
3177 // CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
3178 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
3179 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
3180 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
3181 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
3182 // CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, ptr %a)
3183 // CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], ptr [[__RET]]
3184 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3185 // CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, ptr [[RETVAL]], align 8
3186 // CHECK: ret %struct.float32x2x4_t [[TMP19]]
3187 float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
3188 return vld4_lane_f32(a, b, 1);
3189 }
3191 // CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
3192 // CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
3193 // CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
3194 // CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
3195 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
3196 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
3197 // CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3198 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3199 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3200 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
3201 // CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
3202 // CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
3203 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3204 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
3205 // CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
3206 // CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
3207 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3208 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
3209 // CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
3210 // CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
3211 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
3212 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
3213 // CHECK: [[TMP10:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
3214 // CHECK: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
3215 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
3216 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
3217 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
3218 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
3219 // CHECK: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, ptr %a)
3220 // CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], ptr [[__RET]]
3221 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3222 // CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
3223 // CHECK: ret %struct.float64x1x4_t [[TMP19]]
3224 float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
3225 return vld4_lane_f64(a, b, 0);
3226 }
3228 // CHECK-LABEL: define{{.*}} %struct.poly8x8x4_t @test_vld4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
3229 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
3230 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
3231 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
3232 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
3233 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
3234 // CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3235 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3236 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3237 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
3238 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
3239 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3240 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
3241 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
3242 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3243 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
3244 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
3245 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
3246 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
3247 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
3248 // CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
3249 // CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
3250 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3251 // CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, ptr [[RETVAL]], align 8
3252 // CHECK: ret %struct.poly8x8x4_t [[TMP10]]
3253 poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) {
3254 return vld4_lane_p8(a, b, 7);
3255 }
3257 // CHECK-LABEL: define{{.*}} %struct.poly16x4x4_t @test_vld4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3258 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
3259 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
3260 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
3261 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
3262 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
3263 // CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3264 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3265 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3266 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
3267 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
3268 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
3269 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3270 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
3271 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
3272 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
3273 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3274 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
3275 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
3276 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
3277 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
3278 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
3279 // CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
3280 // CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
3281 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
3282 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
3283 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
3284 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
3285 // CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
3286 // CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
3287 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3288 // CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, ptr [[RETVAL]], align 8
3289 // CHECK: ret %struct.poly16x4x4_t [[TMP19]]
3290 poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) {
3291 return vld4_lane_p16(a, b, 3);
3292 }
3294 // CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
3295 // CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
3296 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
3297 // CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
3298 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
3299 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
3300 // CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3301 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
3302 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3303 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
3304 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
3305 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
3306 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3307 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
3308 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
3309 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
3310 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3311 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
3312 // CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
3313 // CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
3314 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
3315 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
3316 // CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
3317 // CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
3318 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
3319 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
3320 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
3321 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
3322 // CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
3323 // CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
3324 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
3325 // CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
3326 // CHECK: ret %struct.poly64x1x4_t [[TMP19]]
3327 poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) {
3328 return vld4_lane_p64(a, b, 0);
3329 }
3331 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
3332 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
3333 // CHECK: store i8 [[TMP0]], ptr %a
3334 // CHECK: ret void
3335 void test_vst1q_lane_u8(uint8_t *a, uint8x16_t b) {
3336 vst1q_lane_u8(a, b, 15);
3337 }
3339 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
3340 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3341 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3342 // CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
3343 // CHECK: store i16 [[TMP3]], ptr %a
3344 // CHECK: ret void
3345 void test_vst1q_lane_u16(uint16_t *a, uint16x8_t b) {
3346 vst1q_lane_u16(a, b, 7);
3347 }
3349 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
3350 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3351 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3352 // CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
3353 // CHECK: store i32 [[TMP3]], ptr %a
3354 // CHECK: ret void
3355 void test_vst1q_lane_u32(uint32_t *a, uint32x4_t b) {
3356 vst1q_lane_u32(a, b, 3);
3357 }
3359 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
3360 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3361 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3362 // CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
3363 // CHECK: store i64 [[TMP3]], ptr %a
3364 // CHECK: ret void
3365 void test_vst1q_lane_u64(uint64_t *a, uint64x2_t b) {
3366 vst1q_lane_u64(a, b, 1);
3367 }
3369 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
3370 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
3371 // CHECK: store i8 [[TMP0]], ptr %a
3372 // CHECK: ret void
3373 void test_vst1q_lane_s8(int8_t *a, int8x16_t b) {
3374 vst1q_lane_s8(a, b, 15);
3375 }
3377 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
3378 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3379 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3380 // CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
3381 // CHECK: store i16 [[TMP3]], ptr %a
3382 // CHECK: ret void
3383 void test_vst1q_lane_s16(int16_t *a, int16x8_t b) {
3384 vst1q_lane_s16(a, b, 7);
3385 }
3387 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
3388 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3389 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3390 // CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
3391 // CHECK: store i32 [[TMP3]], ptr %a
3392 // CHECK: ret void
3393 void test_vst1q_lane_s32(int32_t *a, int32x4_t b) {
3394 vst1q_lane_s32(a, b, 3);
3395 }
3397 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
3398 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3399 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3400 // CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
3401 // CHECK: store i64 [[TMP3]], ptr %a
3402 // CHECK: ret void
3403 void test_vst1q_lane_s64(int64_t *a, int64x2_t b) {
3404 vst1q_lane_s64(a, b, 1);
3405 }
3407 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
3408 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
3409 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
3410 // CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
3411 // CHECK: store half [[TMP3]], ptr %a
3412 // CHECK: ret void
3413 void test_vst1q_lane_f16(float16_t *a, float16x8_t b) {
3414 vst1q_lane_f16(a, b, 7);
3415 }
3417 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
3418 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3419 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3420 // CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
3421 // CHECK: store float [[TMP3]], ptr %a
3422 // CHECK: ret void
3423 void test_vst1q_lane_f32(float32_t *a, float32x4_t b) {
3424 vst1q_lane_f32(a, b, 3);
3425 }
3427 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
3428 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
3429 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3430 // CHECK: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
3431 // CHECK: store double [[TMP3]], ptr %a
3432 // CHECK: ret void
3433 void test_vst1q_lane_f64(float64_t *a, float64x2_t b) {
3434 vst1q_lane_f64(a, b, 1);
3435 }
3437 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
3438 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
3439 // CHECK: store i8 [[TMP0]], ptr %a
3440 // CHECK: ret void
3441 void test_vst1q_lane_p8(poly8_t *a, poly8x16_t b) {
3442 vst1q_lane_p8(a, b, 15);
3443 }
3445 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
3446 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3447 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3448 // CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
3449 // CHECK: store i16 [[TMP3]], ptr %a
3450 // CHECK: ret void
3451 void test_vst1q_lane_p16(poly16_t *a, poly16x8_t b) {
3452 vst1q_lane_p16(a, b, 7);
3453 }
3455 // CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
3456 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3457 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3458 // CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
3459 // CHECK: store i64 [[TMP3]], ptr %a
3460 // CHECK: ret void
3461 void test_vst1q_lane_p64(poly64_t *a, poly64x2_t b) {
3462 vst1q_lane_p64(a, b, 1);
3463 }
3465 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
3466 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
3467 // CHECK: store i8 [[TMP0]], ptr %a
3468 // CHECK: ret void
3469 void test_vst1_lane_u8(uint8_t *a, uint8x8_t b) {
3470 vst1_lane_u8(a, b, 7);
3471 }
3473 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
3474 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3475 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3476 // CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
3477 // CHECK: store i16 [[TMP3]], ptr %a
3478 // CHECK: ret void
3479 void test_vst1_lane_u16(uint16_t *a, uint16x4_t b) {
3480 vst1_lane_u16(a, b, 3);
3481 }
3483 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
3484 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3485 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3486 // CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
3487 // CHECK: store i32 [[TMP3]], ptr %a
3488 // CHECK: ret void
3489 void test_vst1_lane_u32(uint32_t *a, uint32x2_t b) {
3490 vst1_lane_u32(a, b, 1);
3491 }
3493 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
3494 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3495 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3496 // CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
3497 // CHECK: store i64 [[TMP3]], ptr %a
3498 // CHECK: ret void
3499 void test_vst1_lane_u64(uint64_t *a, uint64x1_t b) {
3500 vst1_lane_u64(a, b, 0);
3501 }
3503 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
3504 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
3505 // CHECK: store i8 [[TMP0]], ptr %a
3506 // CHECK: ret void
3507 void test_vst1_lane_s8(int8_t *a, int8x8_t b) {
3508 vst1_lane_s8(a, b, 7);
3509 }
3511 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
3512 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3513 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3514 // CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
3515 // CHECK: store i16 [[TMP3]], ptr %a
3516 // CHECK: ret void
3517 void test_vst1_lane_s16(int16_t *a, int16x4_t b) {
3518 vst1_lane_s16(a, b, 3);
3519 }
3521 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
3522 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3523 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3524 // CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
3525 // CHECK: store i32 [[TMP3]], ptr %a
3526 // CHECK: ret void
3527 void test_vst1_lane_s32(int32_t *a, int32x2_t b) {
3528 vst1_lane_s32(a, b, 1);
3529 }
3531 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
3532 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3533 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3534 // CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
3535 // CHECK: store i64 [[TMP3]], ptr %a
3536 // CHECK: ret void
3537 void test_vst1_lane_s64(int64_t *a, int64x1_t b) {
3538 vst1_lane_s64(a, b, 0);
3539 }
3541 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
3542 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
3543 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
3544 // CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
3545 // CHECK: store half [[TMP3]], ptr %a
3546 // CHECK: ret void
3547 void test_vst1_lane_f16(float16_t *a, float16x4_t b) {
3548 vst1_lane_f16(a, b, 3);
3549 }
3551 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
3552 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3553 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3554 // CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
3555 // CHECK: store float [[TMP3]], ptr %a
3556 // CHECK: ret void
3557 void test_vst1_lane_f32(float32_t *a, float32x2_t b) {
3558 vst1_lane_f32(a, b, 1);
3559 }
3561 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
3562 // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
3563 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
3564 // CHECK: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
3565 // CHECK: store double [[TMP3]], ptr %a
3566 // CHECK: ret void
3567 void test_vst1_lane_f64(float64_t *a, float64x1_t b) {
3568 vst1_lane_f64(a, b, 0);
3569 }
3571 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
3572 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
3573 // CHECK: store i8 [[TMP0]], ptr %a
3574 // CHECK: ret void
3575 void test_vst1_lane_p8(poly8_t *a, poly8x8_t b) {
3576 vst1_lane_p8(a, b, 7);
3577 }
3579 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
3580 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3581 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3582 // CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
3583 // CHECK: store i16 [[TMP3]], ptr %a
3584 // CHECK: ret void
3585 void test_vst1_lane_p16(poly16_t *a, poly16x4_t b) {
3586 vst1_lane_p16(a, b, 3);
3587 }
3589 // CHECK-LABEL: define{{.*}} void @test_vst1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
3590 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3591 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3592 // CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
3593 // CHECK: store i64 [[TMP3]], ptr %a
3594 // CHECK: ret void
3595 void test_vst1_lane_p64(poly64_t *a, poly64x1_t b) {
3596 vst1_lane_p64(a, b, 0);
3597 }
3599 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
3600 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
3601 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
3602 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
3603 // CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3604 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3605 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
3606 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
3607 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
3608 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
3609 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
3610 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
3611 // CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
3612 // CHECK: ret void
3613 void test_vst2q_lane_u8(uint8_t *a, uint8x16x2_t b) {
3614 vst2q_lane_u8(a, b, 15);
3615 }
3617 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
3618 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
3619 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
3620 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
3621 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3622 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3623 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
3624 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
3625 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
3626 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
3627 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
3628 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
3629 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
3630 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
3631 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
3632 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
3633 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
3634 // CHECK: ret void
3635 void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
3636 vst2q_lane_u16(a, b, 7);
3637 }
3639 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
3640 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
3641 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
3642 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
3643 // CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3644 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3645 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
3646 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
3647 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
3648 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
3649 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
3650 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
3651 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
3652 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
3653 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
3654 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
3655 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
3656 // CHECK: ret void
3657 void test_vst2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
3658 vst2q_lane_u32(a, b, 3);
3659 }
3661 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
3662 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
3663 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
3664 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
3665 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3666 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3667 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
3668 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
3669 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
3670 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
3671 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
3672 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
3673 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
3674 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
3675 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
3676 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
3677 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
3678 // CHECK: ret void
3679 void test_vst2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
3680 vst2q_lane_u64(a, b, 1);
3681 }
3683 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
3684 // CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
3685 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
3686 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
3687 // CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3688 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3689 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
3690 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
3691 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
3692 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
3693 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
3694 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
3695 // CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
3696 // CHECK: ret void
3697 void test_vst2q_lane_s8(int8_t *a, int8x16x2_t b) {
3698 vst2q_lane_s8(a, b, 15);
3699 }
3701 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
3702 // CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
3703 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
3704 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
3705 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3706 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3707 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
3708 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
3709 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
3710 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
3711 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
3712 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
3713 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
3714 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
3715 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
3716 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
3717 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
3718 // CHECK: ret void
3719 void test_vst2q_lane_s16(int16_t *a, int16x8x2_t b) {
3720 vst2q_lane_s16(a, b, 7);
3721 }
3723 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
3724 // CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
3725 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
3726 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
3727 // CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3728 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3729 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
3730 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
3731 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
3732 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
3733 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
3734 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
3735 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
3736 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
3737 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
3738 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
3739 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
3740 // CHECK: ret void
3741 void test_vst2q_lane_s32(int32_t *a, int32x4x2_t b) {
3742 vst2q_lane_s32(a, b, 3);
3743 }
3745 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
3746 // CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
3747 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
3748 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
3749 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3750 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3751 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
3752 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
3753 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
3754 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
3755 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
3756 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
3757 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
3758 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
3759 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
3760 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
3761 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
3762 // CHECK: ret void
3763 void test_vst2q_lane_s64(int64_t *a, int64x2x2_t b) {
3764 vst2q_lane_s64(a, b, 1);
3765 }
3767 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
3768 // CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
3769 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
3770 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
3771 // CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3772 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3773 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
3774 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
3775 // CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
3776 // CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
3777 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
3778 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
3779 // CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
3780 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
3781 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
3782 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
3783 // CHECK: call void @llvm.aarch64.neon.st2lane.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, ptr %a)
3784 // CHECK: ret void
3785 void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) {
3786 vst2q_lane_f16(a, b, 7);
3787 }
3789 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 {
3790 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
3791 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
3792 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
3793 // CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3794 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3795 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
3796 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
3797 // CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
3798 // CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
3799 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
3800 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
3801 // CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
3802 // CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
3803 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
3804 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
3805 // CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, ptr %a)
3806 // CHECK: ret void
3807 void test_vst2q_lane_f32(float32_t *a, float32x4x2_t b) {
3808 vst2q_lane_f32(a, b, 3);
3809 }
3811 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 {
3812 // CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
3813 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
3814 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
3815 // CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3816 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3817 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
3818 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
3819 // CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
3820 // CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
3821 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
3822 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
3823 // CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
3824 // CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
3825 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
3826 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
3827 // CHECK: call void @llvm.aarch64.neon.st2lane.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, ptr %a)
3828 // CHECK: ret void
3829 void test_vst2q_lane_f64(float64_t *a, float64x2x2_t b) {
3830 vst2q_lane_f64(a, b, 1);
3831 }
3833 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
3834 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
3835 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
3836 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
3837 // CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3838 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3839 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
3840 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
3841 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
3842 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
3843 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
3844 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
3845 // CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
3846 // CHECK: ret void
3847 void test_vst2q_lane_p8(poly8_t *a, poly8x16x2_t b) {
3848 vst2q_lane_p8(a, b, 15);
3849 }
3851 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
3852 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
3853 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
3854 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
3855 // CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3856 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3857 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
3858 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
3859 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
3860 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
3861 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
3862 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
3863 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
3864 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
3865 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
3866 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
3867 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
3868 // CHECK: ret void
3869 void test_vst2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
3870 vst2q_lane_p16(a, b, 7);
3871 }
3873 // CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
3874 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
3875 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
3876 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
3877 // CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
3878 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
3879 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
3880 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
3881 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
3882 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
3883 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
3884 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
3885 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
3886 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
3887 // CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
3888 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
3889 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
3890 // CHECK: ret void
3891 void test_vst2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
3892 vst2q_lane_p64(a, b, 1);
3893 }
3895 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
3896 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
3897 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
3898 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
3899 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3900 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3901 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
3902 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
3903 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
3904 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
3905 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
3906 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
3907 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
3908 // CHECK: ret void
3909 void test_vst2_lane_u8(uint8_t *a, uint8x8x2_t b) {
3910   vst2_lane_u8(a, b, 7);
3911 }
3913 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3914 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
3915 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
3916 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
3917 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3918 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3919 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
3920 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
3921 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
3922 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
3923 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
3924 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
3925 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
3926 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
3927 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
3928 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
3929 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
3930 // CHECK: ret void
3931 void test_vst2_lane_u16(uint16_t *a, uint16x4x2_t b) {
3932   vst2_lane_u16(a, b, 3);
3933 }
3935 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
3936 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
3937 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
3938 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
3939 // CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3940 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3941 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
3942 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
3943 // CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
3944 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
3945 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
3946 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
3947 // CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
3948 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
3949 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
3950 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
3951 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a)
3952 // CHECK: ret void
3953 void test_vst2_lane_u32(uint32_t *a, uint32x2x2_t b) {
3954   vst2_lane_u32(a, b, 1);
3955 }
3957 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
3958 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
3959 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
3960 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
3961 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3962 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3963 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
3964 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
3965 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
3966 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
3967 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
3968 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
3969 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
3970 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
3971 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
3972 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
3973 // CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
3974 // CHECK: ret void
3975 void test_vst2_lane_u64(uint64_t *a, uint64x1x2_t b) {
3976   vst2_lane_u64(a, b, 0);
3977 }
3979 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
3980 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
3981 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
3982 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
3983 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
3984 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
3985 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
3986 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
3987 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
3988 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
3989 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
3990 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
3991 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
3992 // CHECK: ret void
3993 void test_vst2_lane_s8(int8_t *a, int8x8x2_t b) {
3994   vst2_lane_s8(a, b, 7);
3995 }
3997 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
3998 // CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
3999 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
4000 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
4001 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4002 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4003 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
4004 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4005 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4006 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4007 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
4008 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4009 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4010 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4011 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4012 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4013 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
4014 // CHECK: ret void
4015 void test_vst2_lane_s16(int16_t *a, int16x4x2_t b) {
4016   vst2_lane_s16(a, b, 3);
4017 }
4019 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
4020 // CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
4021 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
4022 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
4023 // CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4024 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4025 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
4026 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
4027 // CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
4028 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4029 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
4030 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
4031 // CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
4032 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4033 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4034 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4035 // CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a)
4036 // CHECK: ret void
4037 void test_vst2_lane_s32(int32_t *a, int32x2x2_t b) {
4038   vst2_lane_s32(a, b, 1);
4039 }
4041 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4042 // CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
4043 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
4044 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
4045 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4046 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4047 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
4048 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4049 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4050 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4051 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
4052 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4053 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4054 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4055 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4056 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4057 // CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
4058 // CHECK: ret void
4059 void test_vst2_lane_s64(int64_t *a, int64x1x2_t b) {
4060   vst2_lane_s64(a, b, 0);
4061 }
4063 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 {
4064 // CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
4065 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
4066 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
4067 // CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4068 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4069 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
4070 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
4071 // CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
4072 // CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
4073 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
4074 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
4075 // CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
4076 // CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
4077 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
4078 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
4079 // CHECK: call void @llvm.aarch64.neon.st2lane.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, ptr %a)
4080 // CHECK: ret void
4081 void test_vst2_lane_f16(float16_t *a, float16x4x2_t b) {
4082   vst2_lane_f16(a, b, 3);
4083 }
4085 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 {
4086 // CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
4087 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
4088 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
4089 // CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4090 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4091 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
4092 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
4093 // CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
4094 // CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
4095 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
4096 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
4097 // CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
4098 // CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
4099 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
4100 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
4101 // CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, ptr %a)
4102 // CHECK: ret void
4103 void test_vst2_lane_f32(float32_t *a, float32x2x2_t b) {
4104   vst2_lane_f32(a, b, 1);
4105 }
4107 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 {
4108 // CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
4109 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
4110 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
4111 // CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4112 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4113 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
4114 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
4115 // CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
4116 // CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
4117 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
4118 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
4119 // CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
4120 // CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
4121 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
4122 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
4123 // CHECK: call void @llvm.aarch64.neon.st2lane.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, ptr %a)
4124 // CHECK: ret void
4125 void test_vst2_lane_f64(float64_t *a, float64x1x2_t b) {
4126   vst2_lane_f64(a, b, 0);
4127 }
4129 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4130 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
4131 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
4132 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
4133 // CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4134 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4135 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
4136 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4137 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4138 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
4139 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4140 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4141 // CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
4142 // CHECK: ret void
4143 void test_vst2_lane_p8(poly8_t *a, poly8x8x2_t b) {
4144   vst2_lane_p8(a, b, 7);
4145 }
4147 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4148 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
4149 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
4150 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
4151 // CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4152 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4153 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
4154 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4155 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4156 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4157 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
4158 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4159 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4160 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4161 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4162 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4163 // CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
4164 // CHECK: ret void
4165 void test_vst2_lane_p16(poly16_t *a, poly16x4x2_t b) {
4166   vst2_lane_p16(a, b, 3);
4167 }
4169 // CHECK-LABEL: define{{.*}} void @test_vst2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4170 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
4171 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
4172 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
4173 // CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4174 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
4175 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
4176 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4177 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4178 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4179 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
4180 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4181 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4182 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4183 // CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4184 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4185 // CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
4186 // CHECK: ret void
4187 void test_vst2_lane_p64(poly64_t *a, poly64x1x2_t b) {
4188   vst2_lane_p64(a, b, 0);
4189 }
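// The vst3q_lane_* tests below switch to the three-vector, 128-bit (Q-register) variants:
// the [3 x <N x T>] aggregate is spilled to a 16-byte-aligned temporary, copied with a
// 48-byte memcpy, and the three elements are reloaded (non-i8 element types via a
// <16 x i8> bitcast round trip) before the call to @llvm.aarch64.neon.st3lane.*.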
4191 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4192 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
4193 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
4194 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
4195 // CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4196 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4197 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
4198 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4199 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4200 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
4201 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4202 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4203 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
4204 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4205 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4206 // CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
4207 // CHECK: ret void
4208 void test_vst3q_lane_u8(uint8_t *a, uint8x16x3_t b) {
4209   vst3q_lane_u8(a, b, 15);
4210 }
4212 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4213 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
4214 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
4215 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
4216 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4217 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4218 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
4219 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4220 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4221 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4222 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
4223 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4224 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4225 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4226 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
4227 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4228 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4229 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4230 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4231 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4232 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4233 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
4234 // CHECK: ret void
4235 void test_vst3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
4236   vst3q_lane_u16(a, b, 7);
4237 }
4239 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
4240 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
4241 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
4242 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
4243 // CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4244 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4245 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
4246 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
4247 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
4248 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4249 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
4250 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
4251 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
4252 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4253 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
4254 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
4255 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
4256 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4257 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4258 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4259 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4260 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
4261 // CHECK: ret void
4262 void test_vst3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
4263   vst3q_lane_u32(a, b, 3);
4264 }
4266 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
4267 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
4268 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
4269 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
4270 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4271 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4272 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
4273 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
4274 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
4275 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4276 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
4277 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
4278 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
4279 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4280 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
4281 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
4282 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
4283 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
4284 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4285 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4286 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4287 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
4288 // CHECK: ret void
4289 void test_vst3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
4290   vst3q_lane_u64(a, b, 1);
4291 }
4293 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4294 // CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
4295 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
4296 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
4297 // CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4298 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4299 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
4300 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4301 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4302 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
4303 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4304 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4305 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
4306 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4307 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4308 // CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
4309 // CHECK: ret void
4310 void test_vst3q_lane_s8(int8_t *a, int8x16x3_t b) {
4311   vst3q_lane_s8(a, b, 15);
4312 }
4314 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4315 // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
4316 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
4317 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
4318 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4319 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4320 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
4321 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4322 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4323 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4324 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
4325 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4326 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4327 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4328 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
4329 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4330 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4331 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4332 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4333 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4334 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4335 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
4336 // CHECK: ret void
4337 void test_vst3q_lane_s16(int16_t *a, int16x8x3_t b) {
4338   vst3q_lane_s16(a, b, 7);
4339 }
4341 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
4342 // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
4343 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
4344 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
4345 // CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4346 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4347 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
4348 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
4349 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
4350 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4351 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
4352 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
4353 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
4354 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4355 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
4356 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
4357 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
4358 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4359 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4360 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4361 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4362 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
4363 // CHECK: ret void
4364 void test_vst3q_lane_s32(int32_t *a, int32x4x3_t b) {
4365   vst3q_lane_s32(a, b, 3);
4366 }
4368 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
4369 // CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
4370 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
4371 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
4372 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4373 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4374 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
4375 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
4376 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
4377 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4378 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
4379 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
4380 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
4381 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4382 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
4383 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
4384 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
4385 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
4386 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4387 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4388 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4389 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
4390 // CHECK: ret void
4391 void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) {
4392   vst3q_lane_s64(a, b, 1);
4393 }
4395 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
4396 // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
4397 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
4398 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
4399 // CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4400 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4401 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
4402 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
4403 // CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
4404 // CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
4405 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
4406 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
4407 // CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
4408 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
4409 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
4410 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
4411 // CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
4412 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
4413 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
4414 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
4415 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
4416 // CHECK: call void @llvm.aarch64.neon.st3lane.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, ptr %a)
4417 // CHECK: ret void
4418 void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) {
4419   vst3q_lane_f16(a, b, 7);
4420 }
4422 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
4423 // CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
4424 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
4425 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
4426 // CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4427 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4428 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
4429 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
4430 // CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
4431 // CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
4432 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
4433 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
4434 // CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
4435 // CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
4436 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
4437 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
4438 // CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
4439 // CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
4440 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
4441 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
4442 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
4443 // CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, ptr %a)
4444 // CHECK: ret void
4445 void test_vst3q_lane_f32(float32_t *a, float32x4x3_t b) {
4446   vst3q_lane_f32(a, b, 3);
4447 }
4449 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
4450 // CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
4451 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
4452 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
4453 // CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4454 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4455 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
4456 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
4457 // CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
4458 // CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
4459 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
4460 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
4461 // CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
4462 // CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
4463 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
4464 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
4465 // CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
4466 // CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
4467 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
4468 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
4469 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
4470 // CHECK: call void @llvm.aarch64.neon.st3lane.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, ptr %a)
4471 // CHECK: ret void
4472 void test_vst3q_lane_f64(float64_t *a, float64x2x3_t b) {
4473   vst3q_lane_f64(a, b, 1);
4474 }
4476 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4477 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
4478 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
4479 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
4480 // CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4481 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4482 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
4483 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4484 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4485 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
4486 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4487 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4488 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
4489 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4490 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4491 // CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
4492 // CHECK: ret void
4493 void test_vst3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
4494   vst3q_lane_p8(a, b, 15);
4495 }
4497 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4498 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
4499 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
4500 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
4501 // CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4502 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4503 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
4504 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4505 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4506 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4507 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
4508 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4509 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4510 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4511 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
4512 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4513 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4514 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4515 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4516 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4517 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4518 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
4519 // CHECK: ret void
4520 void test_vst3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
4521   vst3q_lane_p16(a, b, 7);
4522 }
4524 // CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
4525 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
4526 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
4527 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
4528 // CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4529 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
4530 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
4531 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
4532 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
4533 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4534 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
4535 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
4536 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
4537 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4538 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
4539 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
4540 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
4541 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
4542 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4543 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4544 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4545 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
4546 // CHECK: ret void
4547 void test_vst3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
4548   vst3q_lane_p64(a, b, 1);
4549 }
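// The vst3_lane_* tests below are the 64-bit (D-register) counterparts: an 8-byte-aligned
// temporary, a 24-byte memcpy, and <8 x i8> bitcast round trips for the non-i8 element
// types, ending in a call to @llvm.aarch64.neon.st3lane.* with the lane immediate.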
4551 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4552 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
4553 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
4554 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
4555 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4556 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4557 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
4558 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4559 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4560 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
4561 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4562 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4563 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
4564 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
4565 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
4566 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
4567 // CHECK: ret void
4568 void test_vst3_lane_u8(uint8_t *a, uint8x8x3_t b) {
4569   vst3_lane_u8(a, b, 7);
4570 }
4572 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4573 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
4574 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
4575 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
4576 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4577 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4578 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
4579 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4580 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4581 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4582 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
4583 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4584 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4585 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4586 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
4587 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
4588 // CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
4589 // CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4590 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4591 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4592 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4593 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
4594 // CHECK: ret void
4595 void test_vst3_lane_u16(uint16_t *a, uint16x4x3_t b) {
4596   vst3_lane_u16(a, b, 3);
4597 }
4599 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
4600 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
4601 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
4602 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
4603 // CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4604 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4605 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
4606 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
4607 // CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
4608 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4609 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
4610 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
4611 // CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
4612 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4613 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
4614 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
4615 // CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
4616 // CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
4617 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4618 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4619 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
4620 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
4621 // CHECK: ret void
4622 void test_vst3_lane_u32(uint32_t *a, uint32x2x3_t b) {
4623   vst3_lane_u32(a, b, 1);
4624 }
4626 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4627 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
4628 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
4629 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
4630 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4631 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4632 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
4633 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4634 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4635 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4636 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
4637 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4638 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4639 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4640 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
4641 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
4642 // CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
4643 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
4644 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4645 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4646 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
4647 // CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
4648 // CHECK: ret void
4649 void test_vst3_lane_u64(uint64_t *a, uint64x1x3_t b) {
4650 vst3_lane_u64(a, b, 0);
4651 }
4653 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4654 // CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
4655 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
4656 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
4657 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4658 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4659 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
4660 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4661 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4662 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
4663 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4664 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4665 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
4666 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
4667 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
4668 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
4669 // CHECK: ret void
4670 void test_vst3_lane_s8(int8_t *a, int8x8x3_t b) {
4671 vst3_lane_s8(a, b, 7);
4672 }
4674 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4675 // CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
4676 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
4677 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
4678 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4679 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4680 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
4681 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4682 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4683 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4684 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
4685 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4686 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4687 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4688 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
4689 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
4690 // CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
4691 // CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4692 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4693 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4694 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4695 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
4696 // CHECK: ret void
4697 void test_vst3_lane_s16(int16_t *a, int16x4x3_t b) {
4698 vst3_lane_s16(a, b, 3);
4699 }
4701 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
4702 // CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
4703 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
4704 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
4705 // CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4706 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4707 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
4708 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
4709 // CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
4710 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4711 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
4712 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
4713 // CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
4714 // CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4715 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
4716 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
4717 // CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
4718 // CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
4719 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4720 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4721 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
4722 // CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
4723 // CHECK: ret void
4724 void test_vst3_lane_s32(int32_t *a, int32x2x3_t b) {
4725 vst3_lane_s32(a, b, 1);
4726 }
4728 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4729 // CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
4730 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
4731 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
4732 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4733 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4734 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
4735 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4736 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4737 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4738 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
4739 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4740 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4741 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4742 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
4743 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
4744 // CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
4745 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
4746 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4747 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4748 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
4749 // CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
4750 // CHECK: ret void
4751 void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) {
4752 vst3_lane_s64(a, b, 0);
4753 }
4755 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
4756 // CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
4757 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
4758 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
4759 // CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4760 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4761 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
4762 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
4763 // CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
4764 // CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
4765 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
4766 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
4767 // CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
4768 // CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
4769 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
4770 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
4771 // CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
4772 // CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
4773 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
4774 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
4775 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
4776 // CHECK: call void @llvm.aarch64.neon.st3lane.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, ptr %a)
4777 // CHECK: ret void
4778 void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) {
4779 vst3_lane_f16(a, b, 3);
4780 }
4782 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
4783 // CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
4784 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
4785 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
4786 // CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4787 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4788 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
4789 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
4790 // CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
4791 // CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
4792 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
4793 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
4794 // CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
4795 // CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
4796 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
4797 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
4798 // CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
4799 // CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
4800 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
4801 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
4802 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
4803 // CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, ptr %a)
4804 // CHECK: ret void
4805 void test_vst3_lane_f32(float32_t *a, float32x2x3_t b) {
4806 vst3_lane_f32(a, b, 1);
4807 }
4809 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
4810 // CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
4811 // CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
4812 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
4813 // CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4814 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4815 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
4816 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
4817 // CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
4818 // CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
4819 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
4820 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
4821 // CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
4822 // CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
4823 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
4824 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
4825 // CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
4826 // CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
4827 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
4828 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
4829 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
4830 // CHECK: call void @llvm.aarch64.neon.st3lane.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, ptr %a)
4831 // CHECK: ret void
4832 void test_vst3_lane_f64(float64_t *a, float64x1x3_t b) {
4833 vst3_lane_f64(a, b, 0);
4834 }
4836 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
4837 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
4838 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
4839 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
4840 // CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4841 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4842 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
4843 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
4844 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
4845 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
4846 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
4847 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
4848 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
4849 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
4850 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
4851 // CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
4852 // CHECK: ret void
4853 void test_vst3_lane_p8(poly8_t *a, poly8x8x3_t b) {
4854 vst3_lane_p8(a, b, 7);
4855 }
4857 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
4858 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
4859 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
4860 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
4861 // CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4862 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4863 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
4864 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
4865 // CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
4866 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4867 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
4868 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
4869 // CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
4870 // CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4871 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
4872 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
4873 // CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
4874 // CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4875 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4876 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4877 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4878 // CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
4879 // CHECK: ret void
4880 void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) {
4881 vst3_lane_p16(a, b, 3);
4882 }
4884 // CHECK-LABEL: define{{.*}} void @test_vst3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
4885 // CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
4886 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
4887 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
4888 // CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
4889 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
4890 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
4891 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
4892 // CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
4893 // CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4894 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
4895 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
4896 // CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
4897 // CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4898 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
4899 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
4900 // CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
4901 // CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
4902 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4903 // CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4904 // CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
4905 // CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
4906 // CHECK: ret void
4907 void test_vst3_lane_p64(poly64_t *a, poly64x1x3_t b) {
4908 vst3_lane_p64(a, b, 0);
4909 }
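// End of the 64-bit (d-register) vst3_lane variants. They all lower to the
// same llvm.aarch64.neon.st3lane.* call; only the element type and the legal
// lane range change (0-7 for 8-bit, 0-3 for 16-bit, 0-1 for 32-bit, and 0 for
// 64-bit elements, since a d register holds 8/4/2/1 lanes respectively).
// Hypothetical worked example of the store semantics (values invented purely
// for illustration):
//
//   uint32x2x3_t v = { { (uint32x2_t){10, 11},
//                        (uint32x2_t){20, 21},
//                        (uint32x2_t){30, 31} } };
//   vst3_lane_u32(p, v, 1);   // p[0] = 11, p[1] = 21, p[2] = 31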
4911 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
4912 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
4913 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
4914 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
4915 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4916 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
4917 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4918 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
4919 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
4920 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4921 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
4922 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
4923 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4924 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
4925 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
4926 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
4927 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
4928 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
4929 // CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
4930 // CHECK: ret void
4931 void test_vst4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
4932 vst4q_lane_u8(a, b, 15);
4933 }
4935 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
4936 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
4937 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
4938 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
4939 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4940 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
4941 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4942 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
4943 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
4944 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4945 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4946 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
4947 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
4948 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4949 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4950 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
4951 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
4952 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4953 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
4954 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
4955 // CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
4956 // CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
4957 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4958 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4959 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4960 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
4961 // CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
4962 // CHECK: ret void
4963 void test_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
4964 vst4q_lane_u16(a, b, 7);
4965 }
4967 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
4968 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
4969 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
4970 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
4971 // CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
4972 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
4973 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4974 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
4975 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
4976 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4977 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4978 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
4979 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
4980 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4981 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4982 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
4983 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
4984 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4985 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
4986 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
4987 // CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
4988 // CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
4989 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4990 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4991 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4992 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
4993 // CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a)
4994 // CHECK: ret void
4995 void test_vst4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
4996 vst4q_lane_u32(a, b, 3);
4997 }
4999 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
5000 // CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
5001 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
5002 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
5003 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5004 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5005 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5006 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
5007 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
5008 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5009 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5010 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
5011 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
5012 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5013 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5014 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
5015 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
5016 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5017 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
5018 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
5019 // CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
5020 // CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5021 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5022 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5023 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5024 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
5025 // CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
5026 // CHECK: ret void
5027 void test_vst4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
5028 vst4q_lane_u64(a, b, 1);
5029 }
5031 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
5032 // CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
5033 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
5034 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
5035 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5036 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5037 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5038 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
5039 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
5040 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5041 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
5042 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
5043 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5044 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
5045 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
5046 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
5047 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
5048 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
5049 // CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
5050 // CHECK: ret void
5051 void test_vst4q_lane_s8(int8_t *a, int8x16x4_t b) {
5052 vst4q_lane_s8(a, b, 15);
5053 }
5055 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
5056 // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
5057 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
5058 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
5059 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5060 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5061 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5062 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
5063 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
5064 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5065 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5066 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
5067 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
5068 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5069 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5070 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
5071 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
5072 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5073 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5074 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
5075 // CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
5076 // CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5077 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5078 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5079 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5080 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5081 // CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
5082 // CHECK: ret void
5083 void test_vst4q_lane_s16(int16_t *a, int16x8x4_t b) {
5084 vst4q_lane_s16(a, b, 7);
5085 }
5087 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
5088 // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
5089 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
5090 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
5091 // CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5092 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5093 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5094 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
5095 // CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
5096 // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
5097 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5098 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
5099 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
5100 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5101 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5102 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
5103 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
5104 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5105 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5106 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
5107 // CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
5108 // CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
5109 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
5110 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5111 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5112 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
5113 // CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a)
5114 // CHECK: ret void
5115 void test_vst4q_lane_s32(int32_t *a, int32x4x4_t b) {
5116 vst4q_lane_s32(a, b, 3);
5117 }
5119 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
5120 // CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
5121 // CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
5122 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
5123 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5124 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5125 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5126 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
5127 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
5128 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5129 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5130 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
5131 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
5132 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5133 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5134 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
5135 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
5136 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5137 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
5138 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
5139 // CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
5140 // CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5141 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5142 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5143 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5144 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
5145 // CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
5146 // CHECK: ret void
5147 void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) {
5148 vst4q_lane_s64(a, b, 1);
5149 }
5151 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 {
5152 // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
5153 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
5154 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
5155 // CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5156 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5157 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5158 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
5159 // CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
5160 // CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
5161 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5162 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
5163 // CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
5164 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
5165 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5166 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
5167 // CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
5168 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
5169 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5170 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
5171 // CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
5172 // CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
5173 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
5174 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
5175 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
5176 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
5177 // CHECK: call void @llvm.aarch64.neon.st4lane.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, ptr %a)
5178 // CHECK: ret void
5179 void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) {
5180 vst4q_lane_f16(a, b, 7);
5181 }
5183 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 {
5184 // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
5185 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
5186 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
5187 // CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5188 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5189 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5190 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
5191 // CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
5192 // CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
5193 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5194 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
5195 // CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
5196 // CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
5197 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5198 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
5199 // CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
5200 // CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
5201 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5202 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
5203 // CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
5204 // CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
5205 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
5206 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
5207 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
5208 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
5209 // CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, ptr %a)
5210 // CHECK: ret void
5211 void test_vst4q_lane_f32(float32_t *a, float32x4x4_t b) {
5212 vst4q_lane_f32(a, b, 3);
5213 }
5215 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
5216 // CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
5217 // CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
5218 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
5219 // CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5220 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5221 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5222 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
5223 // CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
5224 // CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
5225 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5226 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
5227 // CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
5228 // CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
5229 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5230 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
5231 // CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
5232 // CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
5233 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
5234 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
5235 // CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
5236 // CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
5237 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
5238 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
5239 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
5240 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
5241 // CHECK: call void @llvm.aarch64.neon.st4lane.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, ptr %a)
5242 // CHECK: ret void
5243 void test_vst4q_lane_f64(float64_t *a, float64x2x4_t b) {
5244 vst4q_lane_f64(a, b, 1);
5245 }
5247 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
5248 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
5249 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
5250 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
5251 // CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5252 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5253 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5254 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
5255 // CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
5256 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5257 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
5258 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
5259 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5260 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
5261 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
5262 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
5263 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
5264 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
5265 // CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
5266 // CHECK: ret void
5267 void test_vst4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
5268 vst4q_lane_p8(a, b, 15);
5269 }
5271 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
5272 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
5273 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
5274 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
5275 // CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5276 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5277 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5278 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
5279 // CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
5280 // CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5281 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5282 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
5283 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
5284 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5285 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5286 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
5287 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
5288 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5289 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
5290 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
5291 // CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
5292 // CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5293 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5294 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5295 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5296 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5297 // CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
5298 // CHECK: ret void
5299 void test_vst4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
5300 vst4q_lane_p16(a, b, 7);
5301 }
5303 // CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
5304 // CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
5305 // CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
5306 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
5307 // CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5308 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
5309 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5310 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
5311 // CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
5312 // CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5313 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5314 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
5315 // CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
5316 // CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5317 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5318 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
5319 // CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
5320 // CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5321 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
5322 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
5323 // CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
5324 // CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5325 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5326 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5327 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5328 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
5329 // CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
5330 // CHECK: ret void
5331 void test_vst4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
5332 vst4q_lane_p64(a, b, 1);
5333 }
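// End of the 128-bit (q-register) vst4q_lane variants. They differ from the
// d-register forms mainly in width: the coerced [4 x <...>] argument totals 64
// bytes and is passed alignstack(16), and the element round trip goes through
// <16 x i8> instead of <8 x i8>. A minimal caller sketch (helper name and data
// are illustrative, not part of this test):
//
//   static void demo_vst4q_lane(uint16_t *dst, uint16x8x4_t quad) {
//     // Stores quad.val[0][7], quad.val[1][7], quad.val[2][7], quad.val[3][7]
//     // to dst[0..3] with a single ST4 (single structure, lane 7).
//     vst4q_lane_u16(dst, quad, 7);
//   }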
5335 // CHECK-LABEL: define{{.*}} void @test_vst4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
5336 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
5337 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
5338 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
5339 // CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
5340 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
5341 // CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5342 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
5343 // CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
5344 // CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5345 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
5346 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
5347 // CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5348 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
5349 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
5350 // CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
5351 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
5352 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
5353 // CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
5354 // CHECK: ret void
5355 void test_vst4_lane_u8(uint8_t *a, uint8x8x4_t b) {
5356 vst4_lane_u8(a, b, 7);
5357 }
// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
// CHECK: ret void
void test_vst4_lane_u16(uint16_t *a, uint16x4x4_t b) {
  vst4_lane_u16(a, b, 3);
}

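// Note: for element types wider than i8, the unoptimized front-end output
// round-trips each operand through an <8 x i8> bitcast and back before the
// st4lane call. The casts are not removed by the mem2reg run this test
// applies, so the assertions match both directions of the cast.
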
// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
// CHECK: ret void
void test_vst4_lane_u32(uint32_t *a, uint32x2x4_t b) {
  vst4_lane_u32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
// CHECK: ret void
void test_vst4_lane_u64(uint64_t *a, uint64x1x4_t b) {
  vst4_lane_u64(a, b, 0);
}

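// A 64-bit D register holds only one 64-bit element, so the u64/s64/f64/p64
// variants can only store lane 0; this is why the lane operand of the
// v1i64/v1f64 st4lane calls is always i64 0.
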
// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
// CHECK: ret void
void test_vst4_lane_s8(int8_t *a, int8x8x4_t b) {
  vst4_lane_s8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
// CHECK: ret void
void test_vst4_lane_s16(int16_t *a, int16x4x4_t b) {
  vst4_lane_s16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
// CHECK: ret void
void test_vst4_lane_s32(int32_t *a, int32x2x4_t b) {
  vst4_lane_s32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
// CHECK: ret void
void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) {
  vst4_lane_s64(a, b, 0);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
// CHECK: call void @llvm.aarch64.neon.st4lane.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, ptr %a)
// CHECK: ret void
void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) {
  vst4_lane_f16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, ptr %a)
// CHECK: ret void
void test_vst4_lane_f32(float32_t *a, float32x2x4_t b) {
  vst4_lane_f32(a, b, 1);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
// CHECK: call void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, ptr %a)
// CHECK: ret void
void test_vst4_lane_f64(float64_t *a, float64x1x4_t b) {
  vst4_lane_f64(a, b, 0);
}

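// The polynomial variants reuse the corresponding integer vector layouts, so
// the p8/p16/p64 tests below lower to the same st4lane.v8i8/v4i16/v1i64
// intrinsics as the u8/u16/u64 tests.
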
// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
// CHECK: ret void
void test_vst4_lane_p8(poly8_t *a, poly8x8x4_t b) {
  vst4_lane_p8(a, b, 7);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
// CHECK: ret void
void test_vst4_lane_p16(poly16_t *a, poly16x4x4_t b) {
  vst4_lane_p16(a, b, 3);
}

// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
// CHECK: ret void
void test_vst4_lane_p64(poly64_t *a, poly64x1x4_t b) {
  vst4_lane_p64(a, b, 0);
}