// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | \
// RUN: FileCheck -check-prefixes=CHECK,CHECK-A64 %s
// RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \
// RUN: -target-feature +fp16 -disable-O0-optnone -emit-llvm -o - %s | \
// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s

// REQUIRES: aarch64-registered-target || arm-registered-target

#include <arm_neon.h>
// CHECK-LABEL: @test_vst1_f16_x2(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF:(half|i16)]]>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
  vst1_f16_x2(a, b);
}

// CHECK-LABEL: @test_vst1_f16_x3(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr %a, <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
  vst1_f16_x3(a, b);
}

// CHECK-LABEL: @test_vst1_f16_x4(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x [[HALF]]>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr %a, <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) {
  vst1_f16_x4(a, b);
}

// CHECK-LABEL: @test_vst1_f32_x2(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2f32(ptr %a, <2 x float> [[TMP7]], <2 x float> [[TMP8]])
void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
  vst1_f32_x2(a, b);
}

// CHECK-LABEL: @test_vst1_f32_x3(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2f32(ptr %a, <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]])
void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
  vst1_f32_x3(a, b);
}

// CHECK-LABEL: @test_vst1_f32_x4(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2f32(ptr %a, <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]])
void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
  vst1_f32_x4(a, b);
}

// CHECK-LABEL: @test_vst1_p16_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
  vst1_p16_x2(a, b);
}

// CHECK-LABEL: @test_vst1_p16_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr %a, <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
  vst1_p16_x3(a, b);
}

// CHECK-LABEL: @test_vst1_p16_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr %a, <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
  vst1_p16_x4(a, b);
}

// CHECK-LABEL: @test_vst1_p8_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
  vst1_p8_x2(a, b);
}

// CHECK-LABEL: @test_vst1_p8_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
  vst1_p8_x3(a, b);
}

// CHECK-LABEL: @test_vst1_p8_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) {
  vst1_p8_x4(a, b);
}

// CHECK-LABEL: @test_vst1_s16_x2(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
  vst1_s16_x2(a, b);
}

// CHECK-LABEL: @test_vst1_s16_x3(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr %a, <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
  vst1_s16_x3(a, b);
}

// CHECK-LABEL: @test_vst1_s16_x4(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr %a, <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) {
  vst1_s16_x4(a, b);
}

// CHECK-LABEL: @test_vst1_s32_x2(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2i32(ptr %a, <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
  vst1_s32_x2(a, b);
}

// CHECK-LABEL: @test_vst1_s32_x3(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2i32(ptr %a, <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]])
void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
  vst1_s32_x3(a, b);
}

// CHECK-LABEL: @test_vst1_s32_x4(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2i32(ptr %a, <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]])
void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) {
  vst1_s32_x4(a, b);
}

// CHECK-LABEL: @test_vst1_s64_x2(
// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v1i64(ptr %a, <1 x i64> [[TMP7]], <1 x i64> [[TMP8]])
void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
  vst1_s64_x2(a, b);
}

// CHECK-LABEL: @test_vst1_s64_x3(
// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v1i64(ptr %a, <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]])
void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
  vst1_s64_x3(a, b);
}

// CHECK-LABEL: @test_vst1_s64_x4(
// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v1i64(ptr %a, <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]])
void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) {
  vst1_s64_x4(a, b);
}

// CHECK-LABEL: @test_vst1_s8_x2(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
  vst1_s8_x2(a, b);
}

// CHECK-LABEL: @test_vst1_s8_x3(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
  vst1_s8_x3(a, b);
}

// CHECK-LABEL: @test_vst1_s8_x4(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) {
  vst1_s8_x4(a, b);
}

// CHECK-LABEL: @test_vst1_u16_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
  vst1_u16_x2(a, b);
}

// CHECK-LABEL: @test_vst1_u16_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr %a, <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
  vst1_u16_x3(a, b);
}

// CHECK-LABEL: @test_vst1_u16_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr %a, <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) {
  vst1_u16_x4(a, b);
}

// CHECK-LABEL: @test_vst1_u32_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2i32(ptr %a, <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
  vst1_u32_x2(a, b);
}

// CHECK-LABEL: @test_vst1_u32_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2i32(ptr %a, <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]])
void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
  vst1_u32_x3(a, b);
}

// CHECK-LABEL: @test_vst1_u32_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2i32(ptr %a, <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]])
void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) {
  vst1_u32_x4(a, b);
}

// CHECK-LABEL: @test_vst1_u64_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v1i64(ptr %a, <1 x i64> [[TMP7]], <1 x i64> [[TMP8]])
void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
  vst1_u64_x2(a, b);
}

// CHECK-LABEL: @test_vst1_u64_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v1i64(ptr %a, <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]])
void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
  vst1_u64_x3(a, b);
}

// CHECK-LABEL: @test_vst1_u64_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v1i64(ptr %a, <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]])
void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) {
  vst1_u64_x4(a, b);
}

// CHECK-LABEL: @test_vst1_u8_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
  vst1_u8_x2(a, b);
}

// CHECK-LABEL: @test_vst1_u8_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
  vst1_u8_x3(a, b);
}

// CHECK-LABEL: @test_vst1_u8_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) {
  vst1_u8_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_f16_x2(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN:(16|8)]]
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr %a, <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
  vst1q_f16_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_f16_x3(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr %a, <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) {
  vst1q_f16_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_f16_x4(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x [[HALF]]>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr %a, <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) {
  vst1q_f16_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_f32_x2(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4f32(ptr %a, <4 x float> [[TMP7]], <4 x float> [[TMP8]])
void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
  vst1q_f32_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_f32_x3(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4f32(ptr %a, <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]])
void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
  vst1q_f32_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_f32_x4(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4f32(ptr %a, <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]])
void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
  vst1q_f32_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_p16_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr %a, <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
  vst1q_p16_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_p16_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr %a, <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
  vst1q_p16_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_p16_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr %a, <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
  vst1q_p16_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_p8_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
  vst1q_p8_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_p8_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
  vst1q_p8_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_p8_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
  vst1q_p8_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_s16_x2(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr %a, <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
  vst1q_s16_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_s16_x3(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr %a, <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
  vst1q_s16_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_s16_x4(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr %a, <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
  vst1q_s16_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_s32_x2(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i32(ptr %a, <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
  vst1q_s32_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_s32_x3(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i32(ptr %a, <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
  vst1q_s32_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_s32_x4(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i32(ptr %a, <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) {
  vst1q_s32_x4(a, b);
}

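// [[QALIGN]] is a FileCheck variable bound earlier in the file (outside this
// excerpt); it is expected to resolve to 16 on AArch64 and 8 on AArch32,
// where 128-bit NEON containers only require 8-byte alignment. That is why
// the A64-only store above checks a literal "align 16" while the shared
// CHECK lines stay parametric.
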
// CHECK-LABEL: @test_vst1q_s64_x2(
// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2i64(ptr %a, <2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
  vst1q_s64_x2(a, b);
}

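// Note the operand order: the AArch64 @llvm.aarch64.neon.st1x* intrinsics
// take the data vectors first and the destination pointer last, while the
// AArch32 @llvm.arm.neon.vst1x* intrinsics take the pointer first.
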
// CHECK-LABEL: @test_vst1q_s64_x3(
// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2i64(ptr %a, <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) {
  vst1q_s64_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_s64_x4(
// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2i64(ptr %a, <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) {
  vst1q_s64_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_s8_x2(
// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
  vst1q_s8_x2(a, b);
}

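// The i8 variants need no bitcasts: the element type is already <16 x i8>,
// so the loaded values feed the intrinsic directly and the TMP numbering
// starts lower than in the wider-element tests above.
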
// CHECK-LABEL: @test_vst1q_s8_x3(
// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) {
  vst1q_s8_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_s8_x4(
// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
  vst1q_s8_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_u16_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr %a, <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) {
  vst1q_u16_x2(a, b);
}

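// The unsigned tests expect IR identical to their signed equivalents: LLVM
// vector types carry no signedness, so uint16x8x2_t lowers to the same
// <8 x i16> operations and the same st1x2/vst1x2 intrinsics.
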
// CHECK-LABEL: @test_vst1q_u16_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr %a, <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) {
  vst1q_u16_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_u16_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr %a, <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
  vst1q_u16_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_u32_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i32(ptr %a, <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) {
  vst1q_u32_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_u32_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i32(ptr %a, <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) {
  vst1q_u32_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_u32_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i32(ptr %a, <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
  vst1q_u32_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_u64_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2i64(ptr %a, <2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) {
  vst1q_u64_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_u64_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2i64(ptr %a, <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) {
  vst1q_u64_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_u64_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2i64(ptr %a, <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
  vst1q_u64_x4(a, b);
}

// CHECK-LABEL: @test_vst1q_u8_x2(
// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) {
  vst1q_u8_x2(a, b);
}

// CHECK-LABEL: @test_vst1q_u8_x3(
// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) {
  vst1q_u8_x3(a, b);
}

// CHECK-LABEL: @test_vst1q_u8_x4(
// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align [[QALIGN]]
// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
  vst1q_u8_x4(a, b);
}