// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
// RUN: -target-cpu swift \
// RUN: -target-feature +fullfp16 -ffreestanding \
// RUN: -flax-vector-conversions=none \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg | FileCheck %s

// REQUIRES: aarch64-registered-target || arm-registered-target

#include <arm_neon.h>
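
// Each test_* function below wraps one NEON intrinsic, and the CHECK lines
// pin down the LLVM IR Clang emits for it. The RUN line pipes the -O0 output
// through mem2reg so the checks match clean SSA values rather than stack
// traffic.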
// CHECK-LABEL: @test_vaba_s8(
// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
// CHECK: ret <8 x i8> [[ADD_I]]
int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vaba_s8(a, b, c);
}

// CHECK-LABEL: @test_vaba_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]]
// CHECK: ret <4 x i16> [[ADD_I]]
int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vaba_s16(a, b, c);
}

// CHECK-LABEL: @test_vaba_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]]
// CHECK: ret <2 x i32> [[ADD_I]]
int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vaba_s32(a, b, c);
}

// CHECK-LABEL: @test_vaba_u8(
// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
// CHECK: ret <8 x i8> [[ADD_I]]
uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vaba_u8(a, b, c);
}

// CHECK-LABEL: @test_vaba_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]]
// CHECK: ret <4 x i16> [[ADD_I]]
uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vaba_u16(a, b, c);
}

// CHECK-LABEL: @test_vaba_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]]
// CHECK: ret <2 x i32> [[ADD_I]]
uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vaba_u32(a, b, c);
}

// CHECK-LABEL: @test_vabaq_s8(
// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c)
// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
// CHECK: ret <16 x i8> [[ADD_I]]
int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vabaq_s8(a, b, c);
}

// CHECK-LABEL: @test_vabaq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %b, <8 x i16> %c)
// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vabaq_s16(a, b, c);
}

// CHECK-LABEL: @test_vabaq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %b, <4 x i32> %c)
// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vabaq_s32(a, b, c);
}

// CHECK-LABEL: @test_vabaq_u8(
// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c)
// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
// CHECK: ret <16 x i8> [[ADD_I]]
uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vabaq_u8(a, b, c);
}

// CHECK-LABEL: @test_vabaq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %b, <8 x i16> %c)
// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vabaq_u16(a, b, c);
}

// CHECK-LABEL: @test_vabaq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %b, <4 x i32> %c)
// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vabaq_u32(a, b, c);
}
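
// vabal_*: widening absolute difference and accumulate. The narrow
// vabds/vabdu result is non-negative, so both signed and unsigned variants
// widen it with zext before the add.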
// CHECK-LABEL: @test_vabal_s8(
// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vabal_s8(a, b, c);
}

// CHECK-LABEL: @test_vabal_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vabal_s16(a, b, c);
}

// CHECK-LABEL: @test_vabal_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64>
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vabal_s32(a, b, c);
}

// CHECK-LABEL: @test_vabal_u8(
// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vabal_u8(a, b, c);
}

// CHECK-LABEL: @test_vabal_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vabal_u16(a, b, c);
}

// CHECK-LABEL: @test_vabal_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64>
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vabal_u32(a, b, c);
}
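
// vabd_*/vabdq_*: absolute difference, lowered to the vabds (signed/float)
// or vabdu (unsigned) target intrinsics.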
// CHECK-LABEL: @test_vabd_s8(
// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VABD_V_I]]
int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
  return vabd_s8(a, b);
}

// CHECK-LABEL: @test_vabd_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VABD_V2_I]]
int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
  return vabd_s16(a, b);
}

// CHECK-LABEL: @test_vabd_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VABD_V2_I]]
int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
  return vabd_s32(a, b);
}

// CHECK-LABEL: @test_vabd_u8(
// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VABD_V_I]]
uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
  return vabd_u8(a, b);
}

// CHECK-LABEL: @test_vabd_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VABD_V2_I]]
uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
  return vabd_u16(a, b);
}

// CHECK-LABEL: @test_vabd_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VABD_V2_I]]
uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
  return vabd_u32(a, b);
}

// CHECK-LABEL: @test_vabd_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
// CHECK: ret <2 x float> [[VABD_V2_I]]
float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
  return vabd_f32(a, b);
}

// CHECK-LABEL: @test_vabdq_s8(
// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VABDQ_V_I]]
int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
  return vabdq_s8(a, b);
}

// CHECK-LABEL: @test_vabdq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VABDQ_V2_I]]
int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
  return vabdq_s16(a, b);
}

// CHECK-LABEL: @test_vabdq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VABDQ_V2_I]]
int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
  return vabdq_s32(a, b);
}

// CHECK-LABEL: @test_vabdq_u8(
// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VABDQ_V_I]]
uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
  return vabdq_u8(a, b);
}

// CHECK-LABEL: @test_vabdq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VABDQ_V2_I]]
uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
  return vabdq_u16(a, b);
}

// CHECK-LABEL: @test_vabdq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VABDQ_V2_I]]
uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
  return vabdq_u32(a, b);
}

// CHECK-LABEL: @test_vabdq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %a, <4 x float> %b)
// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x float> [[VABDQ_V2_I]]
float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
  return vabdq_f32(a, b);
}
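
// vabdl_*: absolute difference widened to double-width lanes; as with
// vabal, the widening is a zext for both signednesses.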
// CHECK-LABEL: @test_vabdl_s8(
// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[VMOVL_I_I]]
int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
  return vabdl_s8(a, b);
}

// CHECK-LABEL: @test_vabdl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[VMOVL_I_I]]
int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
  return vabdl_s16(a, b);
}

// CHECK-LABEL: @test_vabdl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[VMOVL_I_I]]
int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
  return vabdl_s32(a, b);
}

// CHECK-LABEL: @test_vabdl_u8(
// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[VMOVL_I_I]]
uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
  return vabdl_u8(a, b);
}

// CHECK-LABEL: @test_vabdl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[VMOVL_I_I]]
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
  return vabdl_u16(a, b);
}

// CHECK-LABEL: @test_vabdl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[VMOVL_I_I]]
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
  return vabdl_u32(a, b);
}
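
// vabs_*/vabsq_*: absolute value. Integer lanes use @llvm.arm.neon.vabs,
// float lanes the generic @llvm.fabs intrinsic.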
// CHECK-LABEL: @test_vabs_s8(
// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a)
// CHECK: ret <8 x i8> [[VABS_I]]
int8x8_t test_vabs_s8(int8x8_t a) {
  return vabs_s8(a);
}

// CHECK-LABEL: @test_vabs_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a)
// CHECK: ret <4 x i16> [[VABS1_I]]
int16x4_t test_vabs_s16(int16x4_t a) {
  return vabs_s16(a);
}

// CHECK-LABEL: @test_vabs_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a)
// CHECK: ret <2 x i32> [[VABS1_I]]
int32x2_t test_vabs_s32(int32x2_t a) {
  return vabs_s32(a);
}

// CHECK-LABEL: @test_vabs_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
// CHECK: ret <2 x float> [[VABS1_I]]
float32x2_t test_vabs_f32(float32x2_t a) {
  return vabs_f32(a);
}

// CHECK-LABEL: @test_vabsq_s8(
// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a)
// CHECK: ret <16 x i8> [[VABS_I]]
int8x16_t test_vabsq_s8(int8x16_t a) {
  return vabsq_s8(a);
}

// CHECK-LABEL: @test_vabsq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a)
// CHECK: ret <8 x i16> [[VABS1_I]]
int16x8_t test_vabsq_s16(int16x8_t a) {
  return vabsq_s16(a);
}

// CHECK-LABEL: @test_vabsq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a)
// CHECK: ret <4 x i32> [[VABS1_I]]
int32x4_t test_vabsq_s32(int32x4_t a) {
  return vabsq_s32(a);
}

// CHECK-LABEL: @test_vabsq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
// CHECK: ret <4 x float> [[VABS1_I]]
float32x4_t test_vabsq_f32(float32x4_t a) {
  return vabsq_f32(a);
}
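
// vadd_*/vaddq_*: lane-wise addition, emitted as plain add/fadd IR rather
// than a target intrinsic.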
// CHECK-LABEL: @test_vadd_s8(
// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[ADD_I]]
int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
  return vadd_s8(a, b);
}

// CHECK-LABEL: @test_vadd_s16(
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[ADD_I]]
int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
  return vadd_s16(a, b);
}

// CHECK-LABEL: @test_vadd_s32(
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[ADD_I]]
int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
  return vadd_s32(a, b);
}

// CHECK-LABEL: @test_vadd_s64(
// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[ADD_I]]
int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
  return vadd_s64(a, b);
}

// CHECK-LABEL: @test_vadd_f32(
// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, %b
// CHECK: ret <2 x float> [[ADD_I]]
float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
  return vadd_f32(a, b);
}

// CHECK-LABEL: @test_vadd_u8(
// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[ADD_I]]
uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
  return vadd_u8(a, b);
}

// CHECK-LABEL: @test_vadd_u16(
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[ADD_I]]
uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
  return vadd_u16(a, b);
}

// CHECK-LABEL: @test_vadd_u32(
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[ADD_I]]
uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
  return vadd_u32(a, b);
}

// CHECK-LABEL: @test_vadd_u64(
// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[ADD_I]]
uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
  return vadd_u64(a, b);
}

// CHECK-LABEL: @test_vaddq_s8(
// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[ADD_I]]
int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
  return vaddq_s8(a, b);
}

// CHECK-LABEL: @test_vaddq_s16(
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
  return vaddq_s16(a, b);
}

// CHECK-LABEL: @test_vaddq_s32(
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

// CHECK-LABEL: @test_vaddq_s64(
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
  return vaddq_s64(a, b);
}

// CHECK-LABEL: @test_vaddq_f32(
// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, %b
// CHECK: ret <4 x float> [[ADD_I]]
float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
  return vaddq_f32(a, b);
}

// CHECK-LABEL: @test_vaddq_u8(
// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[ADD_I]]
uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vaddq_u8(a, b);
}

// CHECK-LABEL: @test_vaddq_u16(
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vaddq_u16(a, b);
}

// CHECK-LABEL: @test_vaddq_u32(
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vaddq_u32(a, b);
}

// CHECK-LABEL: @test_vaddq_u64(
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vaddq_u64(a, b);
}
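
// vaddhn_*: add, then narrow by keeping the high half of each lane
// (add + lshr by half the lane width + trunc).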
// CHECK-LABEL: @test_vaddhn_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[VADDHN2_I]]
int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
  return vaddhn_s16(a, b);
}

// CHECK-LABEL: @test_vaddhn_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[VADDHN2_I]]
int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
  return vaddhn_s32(a, b);
}

// CHECK-LABEL: @test_vaddhn_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[VADDHN2_I]]
int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
  return vaddhn_s64(a, b);
}

// CHECK-LABEL: @test_vaddhn_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[VADDHN2_I]]
uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
  return vaddhn_u16(a, b);
}

// CHECK-LABEL: @test_vaddhn_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[VADDHN2_I]]
uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
  return vaddhn_u32(a, b);
}

// CHECK-LABEL: @test_vaddhn_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[VADDHN2_I]]
uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
  return vaddhn_u64(a, b);
}
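
// vaddl_*: both narrow operands are sign-/zero-extended (VMOVL) to double
// width before the add.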
// CHECK-LABEL: @test_vaddl_s8(
// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
  return vaddl_s8(a, b);
}

// CHECK-LABEL: @test_vaddl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
  return vaddl_s16(a, b);
}

// CHECK-LABEL: @test_vaddl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
  return vaddl_s32(a, b);
}

// CHECK-LABEL: @test_vaddl_u8(
// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
  return vaddl_u8(a, b);
}

// CHECK-LABEL: @test_vaddl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
  return vaddl_u16(a, b);
}

// CHECK-LABEL: @test_vaddl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
  return vaddl_u32(a, b);
}
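
// vaddw_*: wide + narrow addition; only the second operand needs extending.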
// CHECK-LABEL: @test_vaddw_s8(
// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
  return vaddw_s8(a, b);
}

// CHECK-LABEL: @test_vaddw_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
  return vaddw_s16(a, b);
}

// CHECK-LABEL: @test_vaddw_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
  return vaddw_s32(a, b);
}

// CHECK-LABEL: @test_vaddw_u8(
// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
  return vaddw_u8(a, b);
}

// CHECK-LABEL: @test_vaddw_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
  return vaddw_u16(a, b);
}

// CHECK-LABEL: @test_vaddw_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
  return vaddw_u32(a, b);
}
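
// vand_*/vandq_*: bitwise AND, a plain and instruction.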
// CHECK-LABEL: @test_vand_s8(
// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[AND_I]]
int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
  return vand_s8(a, b);
}

// CHECK-LABEL: @test_vand_s16(
// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[AND_I]]
int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
  return vand_s16(a, b);
}

// CHECK-LABEL: @test_vand_s32(
// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[AND_I]]
int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
  return vand_s32(a, b);
}

// CHECK-LABEL: @test_vand_s64(
// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[AND_I]]
int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
  return vand_s64(a, b);
}

// CHECK-LABEL: @test_vand_u8(
// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[AND_I]]
uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
  return vand_u8(a, b);
}

// CHECK-LABEL: @test_vand_u16(
// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[AND_I]]
uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
  return vand_u16(a, b);
}

// CHECK-LABEL: @test_vand_u32(
// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[AND_I]]
uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
  return vand_u32(a, b);
}

// CHECK-LABEL: @test_vand_u64(
// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[AND_I]]
uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
  return vand_u64(a, b);
}

// CHECK-LABEL: @test_vandq_s8(
// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[AND_I]]
int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
  return vandq_s8(a, b);
}

// CHECK-LABEL: @test_vandq_s16(
// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[AND_I]]
int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
  return vandq_s16(a, b);
}

// CHECK-LABEL: @test_vandq_s32(
// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[AND_I]]
int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

// CHECK-LABEL: @test_vandq_s64(
// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[AND_I]]
int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
  return vandq_s64(a, b);
}

// CHECK-LABEL: @test_vandq_u8(
// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[AND_I]]
uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
  return vandq_u8(a, b);
}

// CHECK-LABEL: @test_vandq_u16(
// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[AND_I]]
uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
  return vandq_u16(a, b);
}

// CHECK-LABEL: @test_vandq_u32(
// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[AND_I]]
uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
  return vandq_u32(a, b);
}

// CHECK-LABEL: @test_vandq_u64(
// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[AND_I]]
uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
  return vandq_u64(a, b);
}
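
// vbic_*/vbicq_*: bit clear, a & ~b, emitted as xor with all-ones followed
// by and.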
// CHECK-LABEL: @test_vbic_s8(
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK: ret <8 x i8> [[AND_I]]
int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
  return vbic_s8(a, b);
}

// CHECK-LABEL: @test_vbic_s16(
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK: ret <4 x i16> [[AND_I]]
int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
  return vbic_s16(a, b);
}

// CHECK-LABEL: @test_vbic_s32(
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK: ret <2 x i32> [[AND_I]]
int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
  return vbic_s32(a, b);
}

// CHECK-LABEL: @test_vbic_s64(
// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK: ret <1 x i64> [[AND_I]]
int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
  return vbic_s64(a, b);
}

// CHECK-LABEL: @test_vbic_u8(
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK: ret <8 x i8> [[AND_I]]
uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
  return vbic_u8(a, b);
}

// CHECK-LABEL: @test_vbic_u16(
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK: ret <4 x i16> [[AND_I]]
uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
  return vbic_u16(a, b);
}

// CHECK-LABEL: @test_vbic_u32(
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK: ret <2 x i32> [[AND_I]]
uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
  return vbic_u32(a, b);
}

// CHECK-LABEL: @test_vbic_u64(
// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK: ret <1 x i64> [[AND_I]]
uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
  return vbic_u64(a, b);
}

// CHECK-LABEL: @test_vbicq_s8(
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
// CHECK: ret <16 x i8> [[AND_I]]
int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
  return vbicq_s8(a, b);
}

// CHECK-LABEL: @test_vbicq_s16(
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
// CHECK: ret <8 x i16> [[AND_I]]
int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
  return vbicq_s16(a, b);
}

// CHECK-LABEL: @test_vbicq_s32(
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
// CHECK: ret <4 x i32> [[AND_I]]
int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
  return vbicq_s32(a, b);
}

// CHECK-LABEL: @test_vbicq_s64(
// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
// CHECK: ret <2 x i64> [[AND_I]]
int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
  return vbicq_s64(a, b);
}

// CHECK-LABEL: @test_vbicq_u8(
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
// CHECK: ret <16 x i8> [[AND_I]]
uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
  return vbicq_u8(a, b);
}

// CHECK-LABEL: @test_vbicq_u16(
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
// CHECK: ret <8 x i16> [[AND_I]]
uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
  return vbicq_u16(a, b);
}

// CHECK-LABEL: @test_vbicq_u32(
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
// CHECK: ret <4 x i32> [[AND_I]]
uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
  return vbicq_u32(a, b);
}

// CHECK-LABEL: @test_vbicq_u64(
// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
// CHECK: ret <2 x i64> [[AND_I]]
uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
  return vbicq_u64(a, b);
}
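
// vbsl_*/vbslq_*: bitwise select through @llvm.arm.neon.vbsl on <8 x i8> /
// <16 x i8>, with bitcasts in and out for wider element types.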
// CHECK-LABEL: @test_vbsl_s8(
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <8 x i8> [[VBSL_V_I]]
int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
  return vbsl_s8(a, b, c);
}

// CHECK-LABEL: @test_vbsl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP3]]
int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
  return vbsl_s16(a, b, c);
}

// CHECK-LABEL: @test_vbsl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[TMP3]]
int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
  return vbsl_s32(a, b, c);
}

// CHECK-LABEL: @test_vbsl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
// CHECK: ret <1 x i64> [[TMP3]]
int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
  return vbsl_s64(a, b, c);
}

// CHECK-LABEL: @test_vbsl_u8(
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <8 x i8> [[VBSL_V_I]]
uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vbsl_u8(a, b, c);
}

// CHECK-LABEL: @test_vbsl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP3]]
uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vbsl_u16(a, b, c);
}

// CHECK-LABEL: @test_vbsl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[TMP3]]
uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vbsl_u32(a, b, c);
}

// CHECK-LABEL: @test_vbsl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
// CHECK: ret <1 x i64> [[TMP3]]
uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
  return vbsl_u64(a, b, c);
}

// CHECK-LABEL: @test_vbsl_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
// CHECK: ret <2 x float> [[TMP3]]
float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
  return vbsl_f32(a, b, c);
}

// CHECK-LABEL: @test_vbsl_p8(
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <8 x i8> [[VBSL_V_I]]
poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
  return vbsl_p8(a, b, c);
}

// CHECK-LABEL: @test_vbsl_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP3]]
poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
  return vbsl_p16(a, b, c);
}

// CHECK-LABEL: @test_vbslq_s8(
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
  return vbslq_s8(a, b, c);
}

// CHECK-LABEL: @test_vbslq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP3]]
int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
  return vbslq_s16(a, b, c);
}

// CHECK-LABEL: @test_vbslq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP3]]
int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
  return vbslq_s32(a, b, c);
}

// CHECK-LABEL: @test_vbslq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[TMP3]]
int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
  return vbslq_s64(a, b, c);
}

// CHECK-LABEL: @test_vbslq_u8(
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vbslq_u8(a, b, c);
}

// CHECK-LABEL: @test_vbslq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP3]]
uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vbslq_u16(a, b, c);
}

// CHECK-LABEL: @test_vbslq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP3]]
uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vbslq_u32(a, b, c);
}

// CHECK-LABEL: @test_vbslq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[TMP3]]
uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
  return vbslq_u64(a, b, c);
}

// CHECK-LABEL: @test_vbslq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
// CHECK: ret <4 x float> [[TMP3]]
float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
  return vbslq_f32(a, b, c);
}

// CHECK-LABEL: @test_vbslq_p8(
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
  return vbslq_p8(a, b, c);
}

// CHECK-LABEL: @test_vbslq_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP3]]
poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
  return vbslq_p16(a, b, c);
}
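
// vcage/vcagt/vcale/vcalt: absolute compares (|a| vs |b|). Only vacge and
// vacgt exist as intrinsics, so the le/lt forms swap the operands.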
// CHECK-LABEL: @test_vcage_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: ret <2 x i32> [[VCAGE_V2_I]]
uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
  return vcage_f32(a, b);
}

// CHECK-LABEL: @test_vcageq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %a, <4 x float> %b)
// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]]
uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
  return vcageq_f32(a, b);
}

// CHECK-LABEL: @test_vcagt_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: ret <2 x i32> [[VCAGT_V2_I]]
uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
  return vcagt_f32(a, b);
}

// CHECK-LABEL: @test_vcagtq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %a, <4 x float> %b)
// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]]
uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
  return vcagtq_f32(a, b);
}

// CHECK-LABEL: @test_vcale_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %b, <2 x float> %a)
// CHECK: ret <2 x i32> [[VCALE_V2_I]]
uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
  return vcale_f32(a, b);
}

// CHECK-LABEL: @test_vcaleq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %b, <4 x float> %a)
// CHECK: ret <4 x i32> [[VCALEQ_V2_I]]
uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
  return vcaleq_f32(a, b);
}

// CHECK-LABEL: @test_vcalt_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %b, <2 x float> %a)
// CHECK: ret <2 x i32> [[VCALT_V2_I]]
uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
  return vcalt_f32(a, b);
}

// CHECK-LABEL: @test_vcaltq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %b, <4 x float> %a)
// CHECK: ret <4 x i32> [[VCALTQ_V2_I]]
uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
  return vcaltq_f32(a, b);
}
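
// vceq_*/vceqq_*: lane-wise equality; the <N x i1> compare result is
// sign-extended to all-ones/all-zeros lanes.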

// CHECK-LABEL: @test_vceq_s8(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
  return vceq_s8(a, b);
}

// CHECK-LABEL: @test_vceq_s16(
// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
  return vceq_s16(a, b);
}

// CHECK-LABEL: @test_vceq_s32(
// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
  return vceq_s32(a, b);
}

// CHECK-LABEL: @test_vceq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
  return vceq_f32(a, b);
}

// CHECK-LABEL: @test_vceq_u8(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
  return vceq_u8(a, b);
}

// CHECK-LABEL: @test_vceq_u16(
// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
  return vceq_u16(a, b);
}

// CHECK-LABEL: @test_vceq_u32(
// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
  return vceq_u32(a, b);
}

// CHECK-LABEL: @test_vceq_p8(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
  return vceq_p8(a, b);
}

// CHECK-LABEL: @test_vceqq_s8(
// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
  return vceqq_s8(a, b);
}

// CHECK-LABEL: @test_vceqq_s16(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
  return vceqq_s16(a, b);
}

// CHECK-LABEL: @test_vceqq_s32(
// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
  return vceqq_s32(a, b);
}

// CHECK-LABEL: @test_vceqq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
  return vceqq_f32(a, b);
}

// CHECK-LABEL: @test_vceqq_u8(
// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
  return vceqq_u8(a, b);
}

// CHECK-LABEL: @test_vceqq_u16(
// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
  return vceqq_u16(a, b);
}

// CHECK-LABEL: @test_vceqq_u32(
// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
  return vceqq_u32(a, b);
}

// CHECK-LABEL: @test_vceqq_p8(
// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
  return vceqq_p8(a, b);
}
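
// Note: the lane-wise compares need no target intrinsic: each is an
// icmp/fcmp whose <N x i1> result is sign-extended to all-ones (true) or
// all-zeros (false) lanes. The vcge/vcgt/vcle/vclt tests below follow the
// same pattern with signed, unsigned, or ordered floating-point predicates.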

// CHECK-LABEL: @test_vcge_s8(
// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
  return vcge_s8(a, b);
}

// CHECK-LABEL: @test_vcge_s16(
// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
  return vcge_s16(a, b);
}

// CHECK-LABEL: @test_vcge_s32(
// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
  return vcge_s32(a, b);
}

// CHECK-LABEL: @test_vcge_f32(
// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
  return vcge_f32(a, b);
}

// CHECK-LABEL: @test_vcge_u8(
// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
  return vcge_u8(a, b);
}

// CHECK-LABEL: @test_vcge_u16(
// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
  return vcge_u16(a, b);
}

// CHECK-LABEL: @test_vcge_u32(
// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
  return vcge_u32(a, b);
}

// CHECK-LABEL: @test_vcgeq_s8(
// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
  return vcgeq_s8(a, b);
}

// CHECK-LABEL: @test_vcgeq_s16(
// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
  return vcgeq_s16(a, b);
}

// CHECK-LABEL: @test_vcgeq_s32(
// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
  return vcgeq_s32(a, b);
}

// CHECK-LABEL: @test_vcgeq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
  return vcgeq_f32(a, b);
}

// CHECK-LABEL: @test_vcgeq_u8(
// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgeq_u8(a, b);
}

// CHECK-LABEL: @test_vcgeq_u16(
// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgeq_u16(a, b);
}

// CHECK-LABEL: @test_vcgeq_u32(
// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgeq_u32(a, b);
}

// CHECK-LABEL: @test_vcgt_s8(
// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
  return vcgt_s8(a, b);
}

// CHECK-LABEL: @test_vcgt_s16(
// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
  return vcgt_s16(a, b);
}

// CHECK-LABEL: @test_vcgt_s32(
// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
  return vcgt_s32(a, b);
}

// CHECK-LABEL: @test_vcgt_f32(
// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
  return vcgt_f32(a, b);
}

// CHECK-LABEL: @test_vcgt_u8(
// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
  return vcgt_u8(a, b);
}

// CHECK-LABEL: @test_vcgt_u16(
// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
  return vcgt_u16(a, b);
}

// CHECK-LABEL: @test_vcgt_u32(
// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
  return vcgt_u32(a, b);
}

// CHECK-LABEL: @test_vcgtq_s8(
// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
  return vcgtq_s8(a, b);
}

// CHECK-LABEL: @test_vcgtq_s16(
// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
  return vcgtq_s16(a, b);
}

// CHECK-LABEL: @test_vcgtq_s32(
// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
  return vcgtq_s32(a, b);
}

// CHECK-LABEL: @test_vcgtq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
  return vcgtq_f32(a, b);
}

// CHECK-LABEL: @test_vcgtq_u8(
// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgtq_u8(a, b);
}

// CHECK-LABEL: @test_vcgtq_u16(
// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgtq_u16(a, b);
}

// CHECK-LABEL: @test_vcgtq_u32(
// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgtq_u32(a, b);
}

// CHECK-LABEL: @test_vcle_s8(
// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
  return vcle_s8(a, b);
}

// CHECK-LABEL: @test_vcle_s16(
// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
  return vcle_s16(a, b);
}

// CHECK-LABEL: @test_vcle_s32(
// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
  return vcle_s32(a, b);
}

// CHECK-LABEL: @test_vcle_f32(
// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
  return vcle_f32(a, b);
}

// CHECK-LABEL: @test_vcle_u8(
// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
  return vcle_u8(a, b);
}

// CHECK-LABEL: @test_vcle_u16(
// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
  return vcle_u16(a, b);
}

// CHECK-LABEL: @test_vcle_u32(
// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
  return vcle_u32(a, b);
}

// CHECK-LABEL: @test_vcleq_s8(
// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
  return vcleq_s8(a, b);
}

// CHECK-LABEL: @test_vcleq_s16(
// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
  return vcleq_s16(a, b);
}

// CHECK-LABEL: @test_vcleq_s32(
// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
  return vcleq_s32(a, b);
}

// CHECK-LABEL: @test_vcleq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
  return vcleq_f32(a, b);
}

// CHECK-LABEL: @test_vcleq_u8(
// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
  return vcleq_u8(a, b);
}

// CHECK-LABEL: @test_vcleq_u16(
// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
  return vcleq_u16(a, b);
}

// CHECK-LABEL: @test_vcleq_u32(
// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
  return vcleq_u32(a, b);
}

// CHECK-LABEL: @test_vcls_s8(
// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
// CHECK: ret <8 x i8> [[VCLS_V_I]]
int8x8_t test_vcls_s8(int8x8_t a) {
  return vcls_s8(a);
}

// CHECK-LABEL: @test_vcls_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VCLS_V1_I]]
int16x4_t test_vcls_s16(int16x4_t a) {
  return vcls_s16(a);
}

// CHECK-LABEL: @test_vcls_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VCLS_V1_I]]
int32x2_t test_vcls_s32(int32x2_t a) {
  return vcls_s32(a);
}

// CHECK-LABEL: @test_vcls_u8(
// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
// CHECK: ret <8 x i8> [[VCLS_V_I]]
int8x8_t test_vcls_u8(uint8x8_t a) {
  return vcls_u8(a);
}

// CHECK-LABEL: @test_vcls_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VCLS_V1_I]]
int16x4_t test_vcls_u16(uint16x4_t a) {
  return vcls_u16(a);
}

// CHECK-LABEL: @test_vcls_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VCLS_V1_I]]
int32x2_t test_vcls_u32(uint32x2_t a) {
  return vcls_u32(a);
}

// CHECK-LABEL: @test_vclsq_s8(
// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
int8x16_t test_vclsq_s8(int8x16_t a) {
  return vclsq_s8(a);
}

// CHECK-LABEL: @test_vclsq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
int16x8_t test_vclsq_s16(int16x8_t a) {
  return vclsq_s16(a);
}

// CHECK-LABEL: @test_vclsq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
int32x4_t test_vclsq_s32(int32x4_t a) {
  return vclsq_s32(a);
}

// CHECK-LABEL: @test_vclsq_u8(
// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
int8x16_t test_vclsq_u8(uint8x16_t a) {
  return vclsq_u8(a);
}

// CHECK-LABEL: @test_vclsq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
int16x8_t test_vclsq_u16(uint16x8_t a) {
  return vclsq_u16(a);
}

// CHECK-LABEL: @test_vclsq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
int32x4_t test_vclsq_u32(uint32x4_t a) {
  return vclsq_u32(a);
}
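
// Note: vcls (count leading sign bits) maps to the target intrinsic
// llvm.arm.neon.vcls; as the CHECK lines show, the _u variants lower to the
// same intrinsic as the signed ones.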

// CHECK-LABEL: @test_vclt_s8(
// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
  return vclt_s8(a, b);
}

// CHECK-LABEL: @test_vclt_s16(
// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
  return vclt_s16(a, b);
}

// CHECK-LABEL: @test_vclt_s32(
// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
  return vclt_s32(a, b);
}

// CHECK-LABEL: @test_vclt_f32(
// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
  return vclt_f32(a, b);
}

// CHECK-LABEL: @test_vclt_u8(
// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK: ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
  return vclt_u8(a, b);
}

// CHECK-LABEL: @test_vclt_u16(
// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
  return vclt_u16(a, b);
}

// CHECK-LABEL: @test_vclt_u32(
// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
  return vclt_u32(a, b);
}

// CHECK-LABEL: @test_vcltq_s8(
// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
  return vcltq_s8(a, b);
}

// CHECK-LABEL: @test_vcltq_s16(
// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
  return vcltq_s16(a, b);
}

// CHECK-LABEL: @test_vcltq_s32(
// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
  return vcltq_s32(a, b);
}

// CHECK-LABEL: @test_vcltq_f32(
// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
  return vcltq_f32(a, b);
}

// CHECK-LABEL: @test_vcltq_u8(
// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK: ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
  return vcltq_u8(a, b);
}

// CHECK-LABEL: @test_vcltq_u16(
// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
  return vcltq_u16(a, b);
}

// CHECK-LABEL: @test_vcltq_u32(
// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
  return vcltq_u32(a, b);
}

// CHECK-LABEL: @test_vclz_s8(
// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
// CHECK: ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vclz_s8(int8x8_t a) {
  return vclz_s8(a);
}

// CHECK-LABEL: @test_vclz_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
int16x4_t test_vclz_s16(int16x4_t a) {
  return vclz_s16(a);
}

// CHECK-LABEL: @test_vclz_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
int32x2_t test_vclz_s32(int32x2_t a) {
  return vclz_s32(a);
}

// CHECK-LABEL: @test_vclz_u8(
// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
// CHECK: ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vclz_u8(uint8x8_t a) {
  return vclz_u8(a);
}

// CHECK-LABEL: @test_vclz_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
uint16x4_t test_vclz_u16(uint16x4_t a) {
  return vclz_u16(a);
}

// CHECK-LABEL: @test_vclz_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
uint32x2_t test_vclz_u32(uint32x2_t a) {
  return vclz_u32(a);
}

// CHECK-LABEL: @test_vclzq_s8(
// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
int8x16_t test_vclzq_s8(int8x16_t a) {
  return vclzq_s8(a);
}

// CHECK-LABEL: @test_vclzq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
int16x8_t test_vclzq_s16(int16x8_t a) {
  return vclzq_s16(a);
}

// CHECK-LABEL: @test_vclzq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
int32x4_t test_vclzq_s32(int32x4_t a) {
  return vclzq_s32(a);
}

// CHECK-LABEL: @test_vclzq_u8(
// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
uint8x16_t test_vclzq_u8(uint8x16_t a) {
  return vclzq_u8(a);
}

// CHECK-LABEL: @test_vclzq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
uint16x8_t test_vclzq_u16(uint16x8_t a) {
  return vclzq_u16(a);
}

// CHECK-LABEL: @test_vclzq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
uint32x4_t test_vclzq_u32(uint32x4_t a) {
  return vclzq_u32(a);
}
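
// Note: vclz lowers to the generic llvm.ctlz intrinsic; the "i1 false"
// argument requests a defined result (the element width) for a zero input,
// matching the behaviour of the ARM CLZ instruction.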

// CHECK-LABEL: @test_vcnt_u8(
// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
// CHECK: ret <8 x i8> [[VCNT_V_I]]
uint8x8_t test_vcnt_u8(uint8x8_t a) {
  return vcnt_u8(a);
}

// CHECK-LABEL: @test_vcnt_s8(
// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
// CHECK: ret <8 x i8> [[VCNT_V_I]]
int8x8_t test_vcnt_s8(int8x8_t a) {
  return vcnt_s8(a);
}

// CHECK-LABEL: @test_vcnt_p8(
// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
// CHECK: ret <8 x i8> [[VCNT_V_I]]
poly8x8_t test_vcnt_p8(poly8x8_t a) {
  return vcnt_p8(a);
}

// CHECK-LABEL: @test_vcntq_u8(
// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
uint8x16_t test_vcntq_u8(uint8x16_t a) {
  return vcntq_u8(a);
}

// CHECK-LABEL: @test_vcntq_s8(
// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
int8x16_t test_vcntq_s8(int8x16_t a) {
  return vcntq_s8(a);
}

// CHECK-LABEL: @test_vcntq_p8(
// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
poly8x16_t test_vcntq_p8(poly8x16_t a) {
  return vcntq_p8(a);
}
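
// Note: vcnt (per-byte population count) lowers to the generic llvm.ctpop
// intrinsic rather than a target-specific one, for all element flavours.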

// CHECK-LABEL: @test_vcombine_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
  return vcombine_s8(a, b);
}

// CHECK-LABEL: @test_vcombine_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
  return vcombine_s16(a, b);
}

// CHECK-LABEL: @test_vcombine_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
  return vcombine_s32(a, b);
}

// CHECK-LABEL: @test_vcombine_s64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
  return vcombine_s64(a, b);
}

// CHECK-LABEL: @test_vcombine_f16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x half> [[SHUFFLE_I]]
float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
  return vcombine_f16(a, b);
}

// CHECK-LABEL: @test_vcombine_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
  return vcombine_f32(a, b);
}

// CHECK-LABEL: @test_vcombine_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
  return vcombine_u8(a, b);
}

// CHECK-LABEL: @test_vcombine_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
  return vcombine_u16(a, b);
}

// CHECK-LABEL: @test_vcombine_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
  return vcombine_u32(a, b);
}

// CHECK-LABEL: @test_vcombine_u64(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE_I]]
uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
  return vcombine_u64(a, b);
}

// CHECK-LABEL: @test_vcombine_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
  return vcombine_p8(a, b);
}

// CHECK-LABEL: @test_vcombine_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
  return vcombine_p16(a, b);
}
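
// Note: vcombine needs no intrinsic at all; concatenating two D registers
// into a Q register is expressed as a shufflevector that selects every lane
// of both operands in order.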

// CHECK-LABEL: @test_vcreate_s8(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
// CHECK: ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vcreate_s8(uint64_t a) {
  return vclz_s8(vcreate_s8(a));
}

// CHECK-LABEL: @test_vcreate_imm
// CHECK: [[RES:%.*]] = bitcast i64 0 to <4 x i16>
// CHECK: ret <4 x i16> [[RES]]
int16x4_t test_vcreate_imm(void) {
  return vcreate_s16(0);
}

// CHECK-LABEL: @test_vcreate_s16(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
int16x4_t test_vcreate_s16(uint64_t a) {
  return vclz_s16(vcreate_s16(a));
}

// CHECK-LABEL: @test_vcreate_s32(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
int32x2_t test_vcreate_s32(uint64_t a) {
  return vclz_s32(vcreate_s32(a));
}

// CHECK-LABEL: @test_vcreate_f16(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vcreate_f16(uint64_t a) {
  return vcreate_f16(a);
}

// CHECK-LABEL: @test_vcreate_f32(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vcreate_f32(uint64_t a) {
  return vcreate_f32(a);
}

// CHECK-LABEL: @test_vcreate_u8(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
// CHECK: ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vcreate_u8(uint64_t a) {
  return vclz_s8((int8x8_t)vcreate_u8(a));
}

// CHECK-LABEL: @test_vcreate_u16(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
int16x4_t test_vcreate_u16(uint64_t a) {
  return vclz_s16((int16x4_t)vcreate_u16(a));
}

// CHECK-LABEL: @test_vcreate_u32(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
int32x2_t test_vcreate_u32(uint64_t a) {
  return vclz_s32((int32x2_t)vcreate_u32(a));
}

// CHECK-LABEL: @test_vcreate_u64(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK: ret <1 x i64> [[ADD_I]]
uint64x1_t test_vcreate_u64(uint64_t a) {
  uint64x1_t tmp = vcreate_u64(a);
  return vadd_u64(tmp, tmp);
}

// CHECK-LABEL: @test_vcreate_p8(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]])
// CHECK: ret <8 x i8> [[VCNT_V_I]]
poly8x8_t test_vcreate_p8(uint64_t a) {
  return vcnt_p8(vcreate_p8(a));
}

// CHECK-LABEL: @test_vcreate_p16(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP4]]
poly16x4_t test_vcreate_p16(uint64_t a) {
  poly16x4_t tmp = vcreate_p16(a);
  return vbsl_p16((uint16x4_t)tmp, tmp, tmp);
}

// CHECK-LABEL: @test_vcreate_s64(
// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK: ret <1 x i64> [[ADD_I]]
int64x1_t test_vcreate_s64(uint64_t a) {
  int64x1_t tmp = vcreate_s64(a);
  return vadd_s64(tmp, tmp);
}
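
// Note: vcreate is just a bitcast of the i64 argument to the requested
// vector type. Most of these tests feed the result through another
// intrinsic (vclz, vcnt, vadd, vbsl), presumably so the bitcast has a use
// and the surrounding lowering stays visible in the output.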

// CHECK-LABEL: @test_vcvt_f16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a)
// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
// CHECK: ret <4 x half> [[TMP1]]
float16x4_t test_vcvt_f16_f32(float32x4_t a) {
  return vcvt_f16_f32(a);
}

// CHECK-LABEL: @test_vcvt_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_s32(int32x2_t a) {
  return vcvt_f32_s32(a);
}

// CHECK-LABEL: @test_vcvt_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
  return vcvt_f32_u32(a);
}

// CHECK-LABEL: @test_vcvtq_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
  return vcvtq_f32_s32(a);
}

// CHECK-LABEL: @test_vcvtq_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
// CHECK: ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
  return vcvtq_f32_u32(a);
}

// CHECK-LABEL: @test_vcvt_f32_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
// CHECK: ret <4 x float> [[VCVT_F32_F161_I]]
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
  return vcvt_f32_f16(a);
}

// CHECK-LABEL: @test_vcvt_n_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
// CHECK: ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
  return vcvt_n_f32_s32(a, 1);
}

// CHECK-LABEL: @test_vcvt_n_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
// CHECK: ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
  return vcvt_n_f32_u32(a, 1);
}

// CHECK-LABEL: @test_vcvtq_n_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
// CHECK: ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
  return vcvtq_n_f32_s32(a, 3);
}

// CHECK-LABEL: @test_vcvtq_n_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
// CHECK: ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
  return vcvtq_n_f32_u32(a, 3);
}

// CHECK-LABEL: @test_vcvt_n_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
// CHECK: ret <2 x i32> [[VCVT_N1]]
int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
  return vcvt_n_s32_f32(a, 1);
}

// CHECK-LABEL: @test_vcvtq_n_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
// CHECK: ret <4 x i32> [[VCVT_N1]]
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
  return vcvtq_n_s32_f32(a, 3);
}

// CHECK-LABEL: @test_vcvt_n_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
// CHECK: ret <2 x i32> [[VCVT_N1]]
uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
  return vcvt_n_u32_f32(a, 1);
}

// CHECK-LABEL: @test_vcvtq_n_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
// CHECK: ret <4 x i32> [[VCVT_N1]]
uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
  return vcvtq_n_u32_f32(a, 3);
}

// CHECK-LABEL: @test_vcvt_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVT_I:%.*]] = fptosi <2 x float> %a to <2 x i32>
// CHECK: ret <2 x i32> [[VCVT_I]]
int32x2_t test_vcvt_s32_f32(float32x2_t a) {
  return vcvt_s32_f32(a);
}

// CHECK-LABEL: @test_vcvtq_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVT_I:%.*]] = fptosi <4 x float> %a to <4 x i32>
// CHECK: ret <4 x i32> [[VCVT_I]]
int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
  return vcvtq_s32_f32(a);
}

// CHECK-LABEL: @test_vcvt_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VCVT_I:%.*]] = fptoui <2 x float> %a to <2 x i32>
// CHECK: ret <2 x i32> [[VCVT_I]]
uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
  return vcvt_u32_f32(a);
}

// CHECK-LABEL: @test_vcvtq_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VCVT_I:%.*]] = fptoui <4 x float> %a to <4 x i32>
// CHECK: ret <4 x i32> [[VCVT_I]]
uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
  return vcvtq_u32_f32(a);
}
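
// Note: the plain integer<->float conversions above lower to the generic
// sitofp/uitofp/fptosi/fptoui instructions; only the fixed-point _n_
// variants (which carry a fraction-bits immediate) and the half-precision
// conversions need ARM-specific intrinsics.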

// CHECK-LABEL: @test_vdup_lane_u8(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: ret <8 x i8> [[SHUFFLE]]
uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
  return vdup_lane_u8(a, 7);
}

// CHECK-LABEL: @test_vdup_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <4 x i16> [[LANE]]
uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
  return vdup_lane_u16(a, 3);
}

// CHECK-LABEL: @test_vdup_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: ret <2 x i32> [[LANE]]
uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
  return vdup_lane_u32(a, 1);
}

// CHECK-LABEL: @test_vdup_lane_s8(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: ret <8 x i8> [[SHUFFLE]]
int8x8_t test_vdup_lane_s8(int8x8_t a) {
  return vdup_lane_s8(a, 7);
}

// CHECK-LABEL: @test_vdup_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <4 x i16> [[LANE]]
int16x4_t test_vdup_lane_s16(int16x4_t a) {
  return vdup_lane_s16(a, 3);
}

// CHECK-LABEL: @test_vdup_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: ret <2 x i32> [[LANE]]
int32x2_t test_vdup_lane_s32(int32x2_t a) {
  return vdup_lane_s32(a, 1);
}

// CHECK-LABEL: @test_vdup_lane_p8(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: ret <8 x i8> [[SHUFFLE]]
poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
  return vdup_lane_p8(a, 7);
}

// CHECK-LABEL: @test_vdup_lane_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <4 x i16> [[LANE]]
poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
  return vdup_lane_p16(a, 3);
}

// CHECK-LABEL: @test_vdup_lane_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: ret <2 x float> [[LANE]]
float32x2_t test_vdup_lane_f32(float32x2_t a) {
  return vdup_lane_f32(a, 1);
}

// CHECK-LABEL: @test_vdupq_lane_u8(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: ret <16 x i8> [[SHUFFLE]]
uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
  return vdupq_lane_u8(a, 7);
}

// CHECK-LABEL: @test_vdupq_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <8 x i16> [[LANE]]
uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
  return vdupq_lane_u16(a, 3);
}

// CHECK-LABEL: @test_vdupq_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: ret <4 x i32> [[LANE]]
uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
  return vdupq_lane_u32(a, 1);
}

// CHECK-LABEL: @test_vdupq_lane_s8(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: ret <16 x i8> [[SHUFFLE]]
int8x16_t test_vdupq_lane_s8(int8x8_t a) {
  return vdupq_lane_s8(a, 7);
}

// CHECK-LABEL: @test_vdupq_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <8 x i16> [[LANE]]
int16x8_t test_vdupq_lane_s16(int16x4_t a) {
  return vdupq_lane_s16(a, 3);
}

// CHECK-LABEL: @test_vdupq_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: ret <4 x i32> [[LANE]]
int32x4_t test_vdupq_lane_s32(int32x2_t a) {
  return vdupq_lane_s32(a, 1);
}

// CHECK-LABEL: @test_vdupq_lane_p8(
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: ret <16 x i8> [[SHUFFLE]]
poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
  return vdupq_lane_p8(a, 7);
}

// CHECK-LABEL: @test_vdupq_lane_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <8 x i16> [[LANE]]
poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
  return vdupq_lane_p16(a, 3);
}

// CHECK-LABEL: @test_vdupq_lane_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: ret <4 x float> [[LANE]]
float32x4_t test_vdupq_lane_f32(float32x2_t a) {
  return vdupq_lane_f32(a, 1);
}

// CHECK-LABEL: @test_vdup_lane_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
int64x1_t test_vdup_lane_s64(int64x1_t a) {
  return vdup_lane_s64(a, 0);
}

// CHECK-LABEL: @test_vdup_lane_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
  return vdup_lane_u64(a, 0);
}

// CHECK-LABEL: @test_vdupq_lane_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
int64x2_t test_vdupq_lane_s64(int64x1_t a) {
  return vdupq_lane_s64(a, 0);
}

// CHECK-LABEL: @test_vdupq_lane_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
  return vdupq_lane_u64(a, 0);
}
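
// Note: vdup_lane/vdupq_lane lower to a shufflevector whose constant mask
// repeats the selected lane; no intrinsic is involved, and the lane index
// must be a compile-time constant.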
2652 // CHECK-LABEL: @test_vdup_n_u8(
2653 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
2654 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
2655 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
2656 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
2657 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
2658 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
2659 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
2660 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
2661 // CHECK: ret <8 x i8> [[VECINIT7_I]]
2662 uint8x8_t test_vdup_n_u8(uint8_t a) {
2663 return vdup_n_u8(a);
2666 // CHECK-LABEL: @test_vdup_n_u16(
2667 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
2668 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
2669 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
2670 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
2671 // CHECK: ret <4 x i16> [[VECINIT3_I]]
2672 uint16x4_t test_vdup_n_u16(uint16_t a) {
2673 return vdup_n_u16(a);
2676 // CHECK-LABEL: @test_vdup_n_u32(
2677 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
2678 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
2679 // CHECK: ret <2 x i32> [[VECINIT1_I]]
2680 uint32x2_t test_vdup_n_u32(uint32_t a) {
2681 return vdup_n_u32(a);
2684 // CHECK-LABEL: @test_vdup_n_s8(
2685 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
2686 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
2687 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
2688 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
2689 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
2690 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
2691 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
2692 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
2693 // CHECK: ret <8 x i8> [[VECINIT7_I]]
2694 int8x8_t test_vdup_n_s8(int8_t a) {
2695 return vdup_n_s8(a);
2698 // CHECK-LABEL: @test_vdup_n_s16(
2699 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
2700 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
2701 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
2702 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
2703 // CHECK: ret <4 x i16> [[VECINIT3_I]]
2704 int16x4_t test_vdup_n_s16(int16_t a) {
2705 return vdup_n_s16(a);
2708 // CHECK-LABEL: @test_vdup_n_s32(
2709 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
2710 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
2711 // CHECK: ret <2 x i32> [[VECINIT1_I]]
2712 int32x2_t test_vdup_n_s32(int32_t a) {
2713 return vdup_n_s32(a);
2716 // CHECK-LABEL: @test_vdup_n_p8(
2717 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
2718 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
2719 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
2720 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
2721 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
2722 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
2723 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
2724 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
2725 // CHECK: ret <8 x i8> [[VECINIT7_I]]
2726 poly8x8_t test_vdup_n_p8(poly8_t a) {
2727 return vdup_n_p8(a);
2728 }
2730 // CHECK-LABEL: @test_vdup_n_p16(
2731 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
2732 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
2733 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
2734 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
2735 // CHECK: ret <4 x i16> [[VECINIT3_I]]
2736 poly16x4_t test_vdup_n_p16(poly16_t a) {
2737 return vdup_n_p16(a);
2738 }
2740 // CHECK-LABEL: @test_vdup_n_f16(
2741 // CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
2742 // CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
2743 // CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
2744 // CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
2745 // CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
2746 // CHECK: ret <4 x half> [[VECINIT3]]
2747 float16x4_t test_vdup_n_f16(float16_t *a) {
2748 return vdup_n_f16(*a);
2749 }
2751 // CHECK-LABEL: @test_vdup_n_f32(
2752 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
2753 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
2754 // CHECK: ret <2 x float> [[VECINIT1_I]]
2755 float32x2_t test_vdup_n_f32(float32_t a) {
2756 return vdup_n_f32(a);
2757 }
2759 // CHECK-LABEL: @test_vdupq_n_u8(
2760 // CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
2761 // CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
2762 // CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
2763 // CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
2764 // CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
2765 // CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
2766 // CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
2767 // CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
2768 // CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
2769 // CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
2770 // CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
2771 // CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
2772 // CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
2773 // CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
2774 // CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
2775 // CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
2776 // CHECK: ret <16 x i8> [[VECINIT15_I]]
2777 uint8x16_t test_vdupq_n_u8(uint8_t a) {
2778 return vdupq_n_u8(a);
2779 }
2781 // CHECK-LABEL: @test_vdupq_n_u16(
2782 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
2783 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
2784 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
2785 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
2786 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
2787 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
2788 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
2789 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
2790 // CHECK: ret <8 x i16> [[VECINIT7_I]]
2791 uint16x8_t test_vdupq_n_u16(uint16_t a) {
2792 return vdupq_n_u16(a);
2793 }
2795 // CHECK-LABEL: @test_vdupq_n_u32(
2796 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
2797 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
2798 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
2799 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
2800 // CHECK: ret <4 x i32> [[VECINIT3_I]]
2801 uint32x4_t test_vdupq_n_u32(uint32_t a) {
2802 return vdupq_n_u32(a);
2803 }
2805 // CHECK-LABEL: @test_vdupq_n_s8(
2806 // CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
2807 // CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
2808 // CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
2809 // CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
2810 // CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
2811 // CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
2812 // CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
2813 // CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
2814 // CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
2815 // CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
2816 // CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
2817 // CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
2818 // CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
2819 // CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
2820 // CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
2821 // CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
2822 // CHECK: ret <16 x i8> [[VECINIT15_I]]
2823 int8x16_t test_vdupq_n_s8(int8_t a) {
2824 return vdupq_n_s8(a);
2825 }
2827 // CHECK-LABEL: @test_vdupq_n_s16(
2828 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
2829 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
2830 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
2831 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
2832 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
2833 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
2834 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
2835 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
2836 // CHECK: ret <8 x i16> [[VECINIT7_I]]
2837 int16x8_t test_vdupq_n_s16(int16_t a) {
2838 return vdupq_n_s16(a);
2839 }
2841 // CHECK-LABEL: @test_vdupq_n_s32(
2842 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
2843 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
2844 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
2845 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
2846 // CHECK: ret <4 x i32> [[VECINIT3_I]]
2847 int32x4_t test_vdupq_n_s32(int32_t a) {
2848 return vdupq_n_s32(a);
2849 }
2851 // CHECK-LABEL: @test_vdupq_n_p8(
2852 // CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
2853 // CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
2854 // CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
2855 // CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
2856 // CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
2857 // CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
2858 // CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
2859 // CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
2860 // CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
2861 // CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
2862 // CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
2863 // CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
2864 // CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
2865 // CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
2866 // CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
2867 // CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
2868 // CHECK: ret <16 x i8> [[VECINIT15_I]]
2869 poly8x16_t test_vdupq_n_p8(poly8_t a) {
2870 return vdupq_n_p8(a);
2871 }
2873 // CHECK-LABEL: @test_vdupq_n_p16(
2874 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
2875 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
2876 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
2877 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
2878 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
2879 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
2880 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
2881 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
2882 // CHECK: ret <8 x i16> [[VECINIT7_I]]
2883 poly16x8_t test_vdupq_n_p16(poly16_t a) {
2884 return vdupq_n_p16(a);
2885 }
2887 // CHECK-LABEL: @test_vdupq_n_f16(
2888 // CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
2889 // CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
2890 // CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
2891 // CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
2892 // CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
2893 // CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
2894 // CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
2895 // CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
2896 // CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
2897 // CHECK: ret <8 x half> [[VECINIT7]]
2898 float16x8_t test_vdupq_n_f16(float16_t *a) {
2899 return vdupq_n_f16(*a);
2900 }
2902 // CHECK-LABEL: @test_vdupq_n_f32(
2903 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
2904 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
2905 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
2906 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
2907 // CHECK: ret <4 x float> [[VECINIT3_I]]
2908 float32x4_t test_vdupq_n_f32(float32_t a) {
2909 return vdupq_n_f32(a);
2910 }
2912 // CHECK-LABEL: @test_vdup_n_s64(
2913 // CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
2914 // CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
2915 // CHECK: ret <1 x i64> [[ADD_I]]
2916 int64x1_t test_vdup_n_s64(int64_t a) {
2917 int64x1_t tmp = vdup_n_s64(a);
2918 return vadd_s64(tmp, tmp);
2919 }
2921 // CHECK-LABEL: @test_vdup_n_u64(
2922 // CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
2923 // CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
2924 // CHECK: ret <1 x i64> [[ADD_I]]
2925 int64x1_t test_vdup_n_u64(uint64_t a) {
2926 int64x1_t tmp = (int64x1_t)vdup_n_u64(a);
2927 return vadd_s64(tmp, tmp);
2928 }
2930 // CHECK-LABEL: @test_vdupq_n_s64(
2931 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
2932 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
2933 // CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
2934 // CHECK: ret <2 x i64> [[ADD_I]]
2935 int64x2_t test_vdupq_n_s64(int64_t a) {
2936 int64x2_t tmp = vdupq_n_s64(a);
2937 return vaddq_s64(tmp, tmp);
2938 }
2940 // CHECK-LABEL: @test_vdupq_n_u64(
2941 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
2942 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
2943 // CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
2944 // CHECK: ret <2 x i64> [[ADD_I]]
2945 uint64x2_t test_vdupq_n_u64(uint64_t a) {
2946 uint64x2_t tmp = vdupq_n_u64(a);
2947 return vaddq_u64(tmp, tmp);
2948 }
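// Usage sketch, not part of the checked tests: vdup_n_* splats one scalar
// across every lane of a d register (vdupq_n_* fills a q register), the usual
// way to feed a constant into a lane-wise operation. The 64-bit splats above
// are run through vadd so the splatted value is actually used in the IR.
// Hypothetical helper:
static inline uint8x8_t add_bias_u8(uint8x8_t v) {
  uint8x8_t bias = vdup_n_u8(3); // <3,3,3,3,3,3,3,3>
  return vadd_u8(v, bias);       // lane-wise add of the splatted constant
}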
2950 // CHECK-LABEL: @test_veor_s8(
2951 // CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
2952 // CHECK: ret <8 x i8> [[XOR_I]]
2953 int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
2954 return veor_s8(a, b);
2955 }
2957 // CHECK-LABEL: @test_veor_s16(
2958 // CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
2959 // CHECK: ret <4 x i16> [[XOR_I]]
2960 int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
2961 return veor_s16(a, b);
2962 }
2964 // CHECK-LABEL: @test_veor_s32(
2965 // CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
2966 // CHECK: ret <2 x i32> [[XOR_I]]
2967 int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
2968 return veor_s32(a, b);
2969 }
2971 // CHECK-LABEL: @test_veor_s64(
2972 // CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
2973 // CHECK: ret <1 x i64> [[XOR_I]]
2974 int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
2975 return veor_s64(a, b);
2976 }
2978 // CHECK-LABEL: @test_veor_u8(
2979 // CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
2980 // CHECK: ret <8 x i8> [[XOR_I]]
2981 uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
2982 return veor_u8(a, b);
2983 }
2985 // CHECK-LABEL: @test_veor_u16(
2986 // CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
2987 // CHECK: ret <4 x i16> [[XOR_I]]
2988 uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
2989 return veor_u16(a, b);
2990 }
2992 // CHECK-LABEL: @test_veor_u32(
2993 // CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
2994 // CHECK: ret <2 x i32> [[XOR_I]]
2995 uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
2996 return veor_u32(a, b);
2997 }
2999 // CHECK-LABEL: @test_veor_u64(
3000 // CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
3001 // CHECK: ret <1 x i64> [[XOR_I]]
3002 uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
3003 return veor_u64(a, b);
3004 }
3006 // CHECK-LABEL: @test_veorq_s8(
3007 // CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
3008 // CHECK: ret <16 x i8> [[XOR_I]]
3009 int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
3010 return veorq_s8(a, b);
3011 }
3013 // CHECK-LABEL: @test_veorq_s16(
3014 // CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
3015 // CHECK: ret <8 x i16> [[XOR_I]]
3016 int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
3017 return veorq_s16(a, b);
3018 }
3020 // CHECK-LABEL: @test_veorq_s32(
3021 // CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
3022 // CHECK: ret <4 x i32> [[XOR_I]]
3023 int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
3024 return veorq_s32(a, b);
3025 }
3027 // CHECK-LABEL: @test_veorq_s64(
3028 // CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
3029 // CHECK: ret <2 x i64> [[XOR_I]]
3030 int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
3031 return veorq_s64(a, b);
3032 }
3034 // CHECK-LABEL: @test_veorq_u8(
3035 // CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
3036 // CHECK: ret <16 x i8> [[XOR_I]]
3037 uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
3038 return veorq_u8(a, b);
3039 }
3041 // CHECK-LABEL: @test_veorq_u16(
3042 // CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
3043 // CHECK: ret <8 x i16> [[XOR_I]]
3044 uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
3045 return veorq_u16(a, b);
3046 }
3048 // CHECK-LABEL: @test_veorq_u32(
3049 // CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
3050 // CHECK: ret <4 x i32> [[XOR_I]]
3051 uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
3052 return veorq_u32(a, b);
3053 }
3055 // CHECK-LABEL: @test_veorq_u64(
3056 // CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
3057 // CHECK: ret <2 x i64> [[XOR_I]]
3058 uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
3059 return veorq_u64(a, b);
3060 }
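// Usage sketch, not part of the checked tests: veor_*/veorq_* are plain
// lane-wise XORs, so applying the same mask twice round-trips the input.
// Hypothetical helper:
static inline uint32x2_t toggle_u32(uint32x2_t v, uint32x2_t mask) {
  return veor_u32(v, mask); // flips exactly the bits set in mask
}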
3062 // CHECK-LABEL: @test_vext_s8(
3063 // CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
3064 // CHECK: ret <8 x i8> [[VEXT]]
3065 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
3066 return vext_s8(a, b, 7);
3067 }
3069 // CHECK-LABEL: @test_vext_u8(
3070 // CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
3071 // CHECK: ret <8 x i8> [[VEXT]]
3072 uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
3073 return vext_u8(a, b, 7);
3074 }
3076 // CHECK-LABEL: @test_vext_p8(
3077 // CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
3078 // CHECK: ret <8 x i8> [[VEXT]]
3079 poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
3080 return vext_p8(a, b, 7);
3081 }
3083 // CHECK-LABEL: @test_vext_s16(
3084 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3085 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3086 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3087 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3088 // CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3089 // CHECK: ret <4 x i16> [[VEXT]]
3090 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
3091 return vext_s16(a, b, 3);
3092 }
3094 // CHECK-LABEL: @test_vext_u16(
3095 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3096 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3097 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3098 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3099 // CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3100 // CHECK: ret <4 x i16> [[VEXT]]
3101 uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
3102 return vext_u16(a, b, 3);
3103 }
3105 // CHECK-LABEL: @test_vext_p16(
3106 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3107 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3108 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3109 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3110 // CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3111 // CHECK: ret <4 x i16> [[VEXT]]
3112 poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
3113 return vext_p16(a, b, 3);
3114 }
3116 // CHECK-LABEL: @test_vext_s32(
3117 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3118 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3119 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3120 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3121 // CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
3122 // CHECK: ret <2 x i32> [[VEXT]]
3123 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
3124 return vext_s32(a, b, 1);
3125 }
3127 // CHECK-LABEL: @test_vext_u32(
3128 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3129 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3130 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3131 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3132 // CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
3133 // CHECK: ret <2 x i32> [[VEXT]]
3134 uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
3135 return vext_u32(a, b, 1);
3136 }
3138 // CHECK-LABEL: @test_vext_s64(
3139 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
3140 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3141 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
3142 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3143 // CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
3144 // CHECK: ret <1 x i64> [[VEXT]]
3145 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
3146 return vext_s64(a, b, 0);
3147 }
3149 // CHECK-LABEL: @test_vext_u64(
3150 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
3151 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3152 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
3153 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3154 // CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
3155 // CHECK: ret <1 x i64> [[VEXT]]
3156 uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
3157 return vext_u64(a, b, 0);
3158 }
3160 // CHECK-LABEL: @test_vext_f32(
3161 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3162 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3163 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3164 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3165 // CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
3166 // CHECK: ret <2 x float> [[VEXT]]
3167 float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
3168 return vext_f32(a, b, 1);
3169 }
3171 // CHECK-LABEL: @test_vextq_s8(
3172 // CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
3173 // CHECK: ret <16 x i8> [[VEXT]]
3174 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
3175 return vextq_s8(a, b, 15);
3176 }
3178 // CHECK-LABEL: @test_vextq_u8(
3179 // CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
3180 // CHECK: ret <16 x i8> [[VEXT]]
3181 uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
3182 return vextq_u8(a, b, 15);
3183 }
3185 // CHECK-LABEL: @test_vextq_p8(
3186 // CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
3187 // CHECK: ret <16 x i8> [[VEXT]]
3188 poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
3189 return vextq_p8(a, b, 15);
3190 }
3192 // CHECK-LABEL: @test_vextq_s16(
3193 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3194 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3195 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3196 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3197 // CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
3198 // CHECK: ret <8 x i16> [[VEXT]]
3199 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
3200 return vextq_s16(a, b, 7);
3201 }
3203 // CHECK-LABEL: @test_vextq_u16(
3204 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3205 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3206 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3207 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3208 // CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
3209 // CHECK: ret <8 x i16> [[VEXT]]
3210 uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
3211 return vextq_u16(a, b, 7);
3212 }
3214 // CHECK-LABEL: @test_vextq_p16(
3215 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3216 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3217 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3218 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3219 // CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
3220 // CHECK: ret <8 x i16> [[VEXT]]
3221 poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
3222 return vextq_p16(a, b, 7);
3223 }
3225 // CHECK-LABEL: @test_vextq_s32(
3226 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3227 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3228 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3229 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3230 // CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3231 // CHECK: ret <4 x i32> [[VEXT]]
3232 int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
3233 return vextq_s32(a, b, 3);
3234 }
3236 // CHECK-LABEL: @test_vextq_u32(
3237 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3238 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3239 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3240 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3241 // CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3242 // CHECK: ret <4 x i32> [[VEXT]]
3243 uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
3244 return vextq_u32(a, b, 3);
3245 }
3247 // CHECK-LABEL: @test_vextq_s64(
3248 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3249 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3250 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
3251 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3252 // CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
3253 // CHECK: ret <2 x i64> [[VEXT]]
3254 int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
3255 return vextq_s64(a, b, 1);
3256 }
3258 // CHECK-LABEL: @test_vextq_u64(
3259 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3260 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3261 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
3262 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3263 // CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
3264 // CHECK: ret <2 x i64> [[VEXT]]
3265 uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
3266 return vextq_u64(a, b, 1);
3267 }
3269 // CHECK-LABEL: @test_vextq_f32(
3270 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3271 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3272 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3273 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3274 // CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3275 // CHECK: ret <4 x float> [[VEXT]]
3276 float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
3277 return vextq_f32(a, b, 3);
3278 }
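// Usage sketch, not part of the checked tests: vext_*(a, b, n) reads the
// concatenation a:b starting at lane n, i.e. the tail of a followed by the
// head of b -- the standard NEON idiom for a sliding window across two
// adjacent loads. The index must be a compile-time constant, as in the tests
// above. Hypothetical helper:
static inline uint8x8_t window3_u8(uint8x8_t lo, uint8x8_t hi) {
  return vext_u8(lo, hi, 3); // lanes 3..7 of lo, then lanes 0..2 of hi
}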
3280 // CHECK-LABEL: @test_vfma_f32(
3281 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3282 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3283 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
3284 // CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %c, <2 x float> %a)
3285 // CHECK: ret <2 x float> [[TMP3]]
3286 float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
3287 return vfma_f32(a, b, c);
3288 }
3290 // CHECK-LABEL: @test_vfmaq_f32(
3291 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3292 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3293 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
3294 // CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a)
3295 // CHECK: ret <4 x float> [[TMP3]]
3296 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
3297 return vfmaq_f32(a, b, c);
3298 }
3300 // CHECK-LABEL: @test_vfms_f32(
3301 // CHECK: [[SUB_I:%.*]] = fneg <2 x float> %b
3302 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3303 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
3304 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
3305 // CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %c, <2 x float> %a)
3306 // CHECK: ret <2 x float> [[TMP3]]
3307 float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
3308 return vfms_f32(a, b, c);
3309 }
3311 // CHECK-LABEL: @test_vfmsq_f32(
3312 // CHECK: [[SUB_I:%.*]] = fneg <4 x float> %b
3313 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3314 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
3315 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
3316 // CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %c, <4 x float> %a)
3317 // CHECK: ret <4 x float> [[TMP3]]
3318 float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
3319 return vfmsq_f32(a, b, c);
3320 }
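// Usage sketch, not part of the checked tests: vfma_f32(a, b, c) computes
// a + b*c with a single rounding (it lowers to @llvm.fma above), and
// vfms_f32 negates b first, giving a - b*c. A fused linear interpolation,
// with a hypothetical helper name:
static inline float32x2_t lerp2_f32(float32x2_t a, float32x2_t b, float32x2_t t) {
  return vfma_f32(a, t, vsub_f32(b, a)); // a + t*(b - a), multiply-add fused
}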
3322 // CHECK-LABEL: @test_vget_high_s8(
3323 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3324 // CHECK: ret <8 x i8> [[SHUFFLE_I]]
3325 int8x8_t test_vget_high_s8(int8x16_t a) {
3326 return vget_high_s8(a);
3327 }
3329 // CHECK-LABEL: @test_vget_high_s16(
3330 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3331 // CHECK: ret <4 x i16> [[SHUFFLE_I]]
3332 int16x4_t test_vget_high_s16(int16x8_t a) {
3333 return vget_high_s16(a);
3334 }
3336 // CHECK-LABEL: @test_vget_high_s32(
3337 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
3338 // CHECK: ret <2 x i32> [[SHUFFLE_I]]
3339 int32x2_t test_vget_high_s32(int32x4_t a) {
3340 return vget_high_s32(a);
3341 }
3343 // CHECK-LABEL: @test_vget_high_s64(
3344 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
3345 // CHECK: ret <1 x i64> [[SHUFFLE_I]]
3346 int64x1_t test_vget_high_s64(int64x2_t a) {
3347 return vget_high_s64(a);
3348 }
3350 // CHECK-LABEL: @test_vget_high_f16(
3351 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3352 // CHECK: ret <4 x half> [[SHUFFLE_I]]
3353 float16x4_t test_vget_high_f16(float16x8_t a) {
3354 return vget_high_f16(a);
3355 }
3357 // CHECK-LABEL: @test_vget_high_f32(
3358 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
3359 // CHECK: ret <2 x float> [[SHUFFLE_I]]
3360 float32x2_t test_vget_high_f32(float32x4_t a) {
3361 return vget_high_f32(a);
3362 }
3364 // CHECK-LABEL: @test_vget_high_u8(
3365 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3366 // CHECK: ret <8 x i8> [[SHUFFLE_I]]
3367 uint8x8_t test_vget_high_u8(uint8x16_t a) {
3368 return vget_high_u8(a);
3369 }
3371 // CHECK-LABEL: @test_vget_high_u16(
3372 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3373 // CHECK: ret <4 x i16> [[SHUFFLE_I]]
3374 uint16x4_t test_vget_high_u16(uint16x8_t a) {
3375 return vget_high_u16(a);
3376 }
3378 // CHECK-LABEL: @test_vget_high_u32(
3379 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
3380 // CHECK: ret <2 x i32> [[SHUFFLE_I]]
3381 uint32x2_t test_vget_high_u32(uint32x4_t a) {
3382 return vget_high_u32(a);
3383 }
3385 // CHECK-LABEL: @test_vget_high_u64(
3386 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
3387 // CHECK: ret <1 x i64> [[SHUFFLE_I]]
3388 uint64x1_t test_vget_high_u64(uint64x2_t a) {
3389 return vget_high_u64(a);
3390 }
3392 // CHECK-LABEL: @test_vget_high_p8(
3393 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3394 // CHECK: ret <8 x i8> [[SHUFFLE_I]]
3395 poly8x8_t test_vget_high_p8(poly8x16_t a) {
3396 return vget_high_p8(a);
3397 }
3399 // CHECK-LABEL: @test_vget_high_p16(
3400 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3401 // CHECK: ret <4 x i16> [[SHUFFLE_I]]
3402 poly16x4_t test_vget_high_p16(poly16x8_t a) {
3403 return vget_high_p16(a);
3404 }
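// Usage sketch, not part of the checked tests: vget_high_* selects the upper
// half of a q register (a shufflevector in the IR above; on 32-bit ARM it is
// just the upper d register of the pair), pairing with vget_low_* to split a
// 128-bit vector for 64-bit-wide processing. Hypothetical helper:
static inline int16x4_t upper_half_s16(int16x8_t v) {
  return vget_high_s16(v); // lanes 4..7
}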
3406 // CHECK-LABEL: @test_vget_lane_u8(
3407 // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
3408 // CHECK: ret i8 [[VGET_LANE]]
3409 uint8_t test_vget_lane_u8(uint8x8_t a) {
3410 return vget_lane_u8(a, 7);
3411 }
3413 // CHECK-LABEL: @test_vget_lane_u16(
3414 // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
3415 // CHECK: ret i16 [[VGET_LANE]]
3416 uint16_t test_vget_lane_u16(uint16x4_t a) {
3417 return vget_lane_u16(a, 3);
3418 }
3420 // CHECK-LABEL: @test_vget_lane_u32(
3421 // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %a, i32 1
3422 // CHECK: ret i32 [[VGET_LANE]]
3423 uint32_t test_vget_lane_u32(uint32x2_t a) {
3424 return vget_lane_u32(a, 1);
3425 }
3427 // CHECK-LABEL: @test_vget_lane_s8(
3428 // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
3429 // CHECK: ret i8 [[VGET_LANE]]
3430 int8_t test_vget_lane_s8(int8x8_t a) {
3431 return vget_lane_s8(a, 7);
3432 }
3434 // CHECK-LABEL: @test_vget_lane_s16(
3435 // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
3436 // CHECK: ret i16 [[VGET_LANE]]
3437 int16_t test_vget_lane_s16(int16x4_t a) {
3438 return vget_lane_s16(a, 3);
3439 }
3441 // CHECK-LABEL: @test_vget_lane_s32(
3442 // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %a, i32 1
3443 // CHECK: ret i32 [[VGET_LANE]]
3444 int32_t test_vget_lane_s32(int32x2_t a) {
3445 return vget_lane_s32(a, 1);
3446 }
3448 // CHECK-LABEL: @test_vget_lane_p8(
3449 // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
3450 // CHECK: ret i8 [[VGET_LANE]]
3451 poly8_t test_vget_lane_p8(poly8x8_t a) {
3452 return vget_lane_p8(a, 7);
3453 }
3455 // CHECK-LABEL: @test_vget_lane_p16(
3456 // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
3457 // CHECK: ret i16 [[VGET_LANE]]
3458 poly16_t test_vget_lane_p16(poly16x4_t a) {
3459 return vget_lane_p16(a, 3);
3460 }
3462 // CHECK-LABEL: @test_vget_lane_f32(
3463 // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %a, i32 1
3464 // CHECK: ret float [[VGET_LANE]]
3465 float32_t test_vget_lane_f32(float32x2_t a) {
3466 return vget_lane_f32(a, 1);
3467 }
3469 // CHECK-LABEL: @test_vget_lane_f16(
3470 // CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8
3471 // CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2
3472 // CHECK: store <4 x half> %a, ptr [[__REINT_242]], align 8
3473 // CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_242]], align 8
3474 // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
3475 // CHECK: store i16 [[VGET_LANE]], ptr [[__REINT1_242]], align 2
3476 // CHECK: [[TMP5:%.*]] = load half, ptr [[__REINT1_242]], align 2
3477 // CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
3478 // CHECK: ret float [[CONV]]
3479 float32_t test_vget_lane_f16(float16x4_t a) {
3480 return vget_lane_f16(a, 1);
3481 }
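// Usage sketch, not part of the checked tests: vget_lane_* extracts one lane
// to a scalar; the lane index must be a constant in range. Note above that
// vget_lane_f16 round-trips through memory and an fpext, returning the half
// lane widened to float. Hypothetical helper:
static inline int32_t sum2_s32(int32x2_t v) {
  return vget_lane_s32(v, 0) + vget_lane_s32(v, 1);
}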
3483 // CHECK-LABEL: @test_vgetq_lane_u8(
3484 // CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
3485 // CHECK: ret i8 [[VGET_LANE]]
3486 uint8_t test_vgetq_lane_u8(uint8x16_t a) {
3487 return vgetq_lane_u8(a, 15);
3488 }
3490 // CHECK-LABEL: @test_vgetq_lane_u16(
3491 // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
3492 // CHECK: ret i16 [[VGET_LANE]]
3493 uint16_t test_vgetq_lane_u16(uint16x8_t a) {
3494 return vgetq_lane_u16(a, 7);
3495 }
3497 // CHECK-LABEL: @test_vgetq_lane_u32(
3498 // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> %a, i32 3
3499 // CHECK: ret i32 [[VGET_LANE]]
3500 uint32_t test_vgetq_lane_u32(uint32x4_t a) {
3501 return vgetq_lane_u32(a, 3);
3502 }
3504 // CHECK-LABEL: @test_vgetq_lane_s8(
3505 // CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
3506 // CHECK: ret i8 [[VGET_LANE]]
3507 int8_t test_vgetq_lane_s8(int8x16_t a) {
3508 return vgetq_lane_s8(a, 15);
3509 }
3511 // CHECK-LABEL: @test_vgetq_lane_s16(
3512 // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
3513 // CHECK: ret i16 [[VGET_LANE]]
3514 int16_t test_vgetq_lane_s16(int16x8_t a) {
3515 return vgetq_lane_s16(a, 7);
3516 }
3518 // CHECK-LABEL: @test_vgetq_lane_s32(
3519 // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> %a, i32 3
3520 // CHECK: ret i32 [[VGET_LANE]]
3521 int32_t test_vgetq_lane_s32(int32x4_t a) {
3522 return vgetq_lane_s32(a, 3);
3523 }
3525 // CHECK-LABEL: @test_vgetq_lane_p8(
3526 // CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
3527 // CHECK: ret i8 [[VGET_LANE]]
3528 poly8_t test_vgetq_lane_p8(poly8x16_t a) {
3529 return vgetq_lane_p8(a, 15);
3530 }
3532 // CHECK-LABEL: @test_vgetq_lane_p16(
3533 // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
3534 // CHECK: ret i16 [[VGET_LANE]]
3535 poly16_t test_vgetq_lane_p16(poly16x8_t a) {
3536 return vgetq_lane_p16(a, 7);
3537 }
3539 // CHECK-LABEL: @test_vgetq_lane_f32(
3540 // CHECK: [[VGET_LANE:%.*]] = extractelement <4 x float> %a, i32 3
3541 // CHECK: ret float [[VGET_LANE]]
3542 float32_t test_vgetq_lane_f32(float32x4_t a) {
3543 return vgetq_lane_f32(a, 3);
3544 }
3546 // CHECK-LABEL: @test_vgetq_lane_f16(
3547 // CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16
3548 // CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2
3549 // CHECK: store <8 x half> %a, ptr [[__REINT_244]], align 16
3550 // CHECK: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_244]], align 16
3551 // CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
3552 // CHECK: store i16 [[VGET_LANE]], ptr [[__REINT1_244]], align 2
3553 // CHECK: [[TMP5:%.*]] = load half, ptr [[__REINT1_244]], align 2
3554 // CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
3555 // CHECK: ret float [[CONV]]
3556 float32_t test_vgetq_lane_f16(float16x8_t a) {
3557 return vgetq_lane_f16(a, 3);
3558 }
3560 // CHECK-LABEL: @test_vget_lane_s64(
3561 // CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %a, i32 0
3562 // CHECK: ret i64 [[VGET_LANE]]
3563 int64_t test_vget_lane_s64(int64x1_t a) {
3564 return vget_lane_s64(a, 0);
3565 }
3567 // CHECK-LABEL: @test_vget_lane_u64(
3568 // CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %a, i32 0
3569 // CHECK: ret i64 [[VGET_LANE]]
3570 uint64_t test_vget_lane_u64(uint64x1_t a) {
3571 return vget_lane_u64(a, 0);
3572 }
3574 // CHECK-LABEL: @test_vgetq_lane_s64(
3575 // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> %a, i32 1
3576 // CHECK: ret i64 [[VGET_LANE]]
3577 int64_t test_vgetq_lane_s64(int64x2_t a) {
3578 return vgetq_lane_s64(a, 1);
3579 }
3581 // CHECK-LABEL: @test_vgetq_lane_u64(
3582 // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> %a, i32 1
3583 // CHECK: ret i64 [[VGET_LANE]]
3584 uint64_t test_vgetq_lane_u64(uint64x2_t a) {
3585 return vgetq_lane_u64(a, 1);
3586 }
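// Usage sketch, not part of the checked tests: the q-register lane reads take
// indices up to 15/7/3/1 depending on element width, and the 64-bit forms are
// the usual way to move a whole i64 lane back to core registers.
// Hypothetical helper:
static inline uint64_t upper_u64(uint64x2_t v) {
  return vgetq_lane_u64(v, 1);
}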
3588 // CHECK-LABEL: @test_vget_low_s8(
3589 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3590 // CHECK: ret <8 x i8> [[SHUFFLE_I]]
3591 int8x8_t test_vget_low_s8(int8x16_t a) {
3592 return vget_low_s8(a);
3593 }
3595 // CHECK-LABEL: @test_vget_low_s16(
3596 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3597 // CHECK: ret <4 x i16> [[SHUFFLE_I]]
3598 int16x4_t test_vget_low_s16(int16x8_t a) {
3599 return vget_low_s16(a);
3600 }
3602 // CHECK-LABEL: @test_vget_low_s32(
3603 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
3604 // CHECK: ret <2 x i32> [[SHUFFLE_I]]
3605 int32x2_t test_vget_low_s32(int32x4_t a) {
3606 return vget_low_s32(a);
3607 }
3609 // CHECK-LABEL: @test_vget_low_s64(
3610 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
3611 // CHECK: ret <1 x i64> [[SHUFFLE_I]]
3612 int64x1_t test_vget_low_s64(int64x2_t a) {
3613 return vget_low_s64(a);
3614 }
3616 // CHECK-LABEL: @test_vget_low_f16(
3617 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3618 // CHECK: ret <4 x half> [[SHUFFLE_I]]
3619 float16x4_t test_vget_low_f16(float16x8_t a) {
3620 return vget_low_f16(a);
3621 }
3623 // CHECK-LABEL: @test_vget_low_f32(
3624 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
3625 // CHECK: ret <2 x float> [[SHUFFLE_I]]
3626 float32x2_t test_vget_low_f32(float32x4_t a) {
3627 return vget_low_f32(a);
3628 }
3630 // CHECK-LABEL: @test_vget_low_u8(
3631 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3632 // CHECK: ret <8 x i8> [[SHUFFLE_I]]
3633 uint8x8_t test_vget_low_u8(uint8x16_t a) {
3634 return vget_low_u8(a);
3635 }
3637 // CHECK-LABEL: @test_vget_low_u16(
3638 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3639 // CHECK: ret <4 x i16> [[SHUFFLE_I]]
3640 uint16x4_t test_vget_low_u16(uint16x8_t a) {
3641 return vget_low_u16(a);
3642 }
3644 // CHECK-LABEL: @test_vget_low_u32(
3645 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
3646 // CHECK: ret <2 x i32> [[SHUFFLE_I]]
3647 uint32x2_t test_vget_low_u32(uint32x4_t a) {
3648 return vget_low_u32(a);
3649 }
3651 // CHECK-LABEL: @test_vget_low_u64(
3652 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
3653 // CHECK: ret <1 x i64> [[SHUFFLE_I]]
3654 uint64x1_t test_vget_low_u64(uint64x2_t a) {
3655 return vget_low_u64(a);
3656 }
3658 // CHECK-LABEL: @test_vget_low_p8(
3659 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3660 // CHECK: ret <8 x i8> [[SHUFFLE_I]]
3661 poly8x8_t test_vget_low_p8(poly8x16_t a) {
3662 return vget_low_p8(a);
3663 }
3665 // CHECK-LABEL: @test_vget_low_p16(
3666 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3667 // CHECK: ret <4 x i16> [[SHUFFLE_I]]
3668 poly16x4_t test_vget_low_p16(poly16x8_t a) {
3669 return vget_low_p16(a);
3670 }
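// Usage sketch, not part of the checked tests: vget_low_* plus a widening op
// is the standard pattern for processing a q vector at double element width.
// Hypothetical helper:
static inline uint16x8_t widen_low_u8(uint8x16_t v) {
  return vmovl_u8(vget_low_u8(v)); // zero-extend lanes 0..7 to 16 bits
}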
3672 // CHECK-LABEL: @test_vhadd_s8(
3673 // CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b)
3674 // CHECK: ret <8 x i8> [[VHADD_V_I]]
3675 int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
3676 return vhadd_s8(a, b);
3677 }
3679 // CHECK-LABEL: @test_vhadd_s16(
3680 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3681 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3682 // CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
3683 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
3684 // CHECK: ret <4 x i16> [[VHADD_V2_I]]
3685 int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
3686 return vhadd_s16(a, b);
3687 }
3689 // CHECK-LABEL: @test_vhadd_s32(
3690 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3691 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3692 // CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
3693 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
3694 // CHECK: ret <2 x i32> [[VHADD_V2_I]]
3695 int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
3696 return vhadd_s32(a, b);
3697 }
3699 // CHECK-LABEL: @test_vhadd_u8(
3700 // CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
3701 // CHECK: ret <8 x i8> [[VHADD_V_I]]
3702 uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
3703 return vhadd_u8(a, b);
3704 }
3706 // CHECK-LABEL: @test_vhadd_u16(
3707 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3708 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3709 // CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
3710 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
3711 // CHECK: ret <4 x i16> [[VHADD_V2_I]]
3712 uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
3713 return vhadd_u16(a, b);
3714 }
3716 // CHECK-LABEL: @test_vhadd_u32(
3717 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3718 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3719 // CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
3720 // CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
3721 // CHECK: ret <2 x i32> [[VHADD_V2_I]]
3722 uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
3723 return vhadd_u32(a, b);
3724 }
3726 // CHECK-LABEL: @test_vhaddq_s8(
3727 // CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b)
3728 // CHECK: ret <16 x i8> [[VHADDQ_V_I]]
3729 int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
3730 return vhaddq_s8(a, b);
3731 }
3733 // CHECK-LABEL: @test_vhaddq_s16(
3734 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3735 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3736 // CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a, <8 x i16> %b)
3737 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
3738 // CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
3739 int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
3740 return vhaddq_s16(a, b);
3741 }
3743 // CHECK-LABEL: @test_vhaddq_s32(
3744 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3745 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3746 // CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %a, <4 x i32> %b)
3747 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
3748 // CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
3749 int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
3750 return vhaddq_s32(a, b);
3751 }
3753 // CHECK-LABEL: @test_vhaddq_u8(
3754 // CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
3755 // CHECK: ret <16 x i8> [[VHADDQ_V_I]]
3756 uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
3757 return vhaddq_u8(a, b);
3758 }
3760 // CHECK-LABEL: @test_vhaddq_u16(
3761 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3762 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3763 // CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
3764 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
3765 // CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
3766 uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
3767 return vhaddq_u16(a, b);
3768 }
3770 // CHECK-LABEL: @test_vhaddq_u32(
3771 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3772 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3773 // CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
3774 // CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
3775 // CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
3776 uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
3777 return vhaddq_u32(a, b);
3778 }
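// Usage sketch, not part of the checked tests: vhadd computes (a + b) >> 1
// per lane in a widened intermediate, so it cannot overflow; it truncates,
// where vrhadd would round. Signedness selects vhadds/vhaddu above.
// Hypothetical helper:
static inline uint8x8_t avg_floor_u8(uint8x8_t a, uint8x8_t b) {
  return vhadd_u8(a, b);
}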
3780 // CHECK-LABEL: @test_vhsub_s8(
3781 // CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b)
3782 // CHECK: ret <8 x i8> [[VHSUB_V_I]]
3783 int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
3784 return vhsub_s8(a, b);
3785 }
3787 // CHECK-LABEL: @test_vhsub_s16(
3788 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3789 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3790 // CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %a, <4 x i16> %b)
3791 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
3792 // CHECK: ret <4 x i16> [[VHSUB_V2_I]]
3793 int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
3794 return vhsub_s16(a, b);
3795 }
3797 // CHECK-LABEL: @test_vhsub_s32(
3798 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3799 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3800 // CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %a, <2 x i32> %b)
3801 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
3802 // CHECK: ret <2 x i32> [[VHSUB_V2_I]]
3803 int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
3804 return vhsub_s32(a, b);
3805 }
3807 // CHECK-LABEL: @test_vhsub_u8(
3808 // CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b)
3809 // CHECK: ret <8 x i8> [[VHSUB_V_I]]
3810 uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
3811 return vhsub_u8(a, b);
3812 }
3814 // CHECK-LABEL: @test_vhsub_u16(
3815 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3816 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3817 // CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %a, <4 x i16> %b)
3818 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
3819 // CHECK: ret <4 x i16> [[VHSUB_V2_I]]
3820 uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
3821 return vhsub_u16(a, b);
3822 }
3824 // CHECK-LABEL: @test_vhsub_u32(
3825 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3826 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3827 // CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %a, <2 x i32> %b)
3828 // CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
3829 // CHECK: ret <2 x i32> [[VHSUB_V2_I]]
3830 uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
3831 return vhsub_u32(a, b);
3832 }
3834 // CHECK-LABEL: @test_vhsubq_s8(
3835 // CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b)
3836 // CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
3837 int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
3838 return vhsubq_s8(a, b);
3839 }
3841 // CHECK-LABEL: @test_vhsubq_s16(
3842 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3843 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3844 // CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %a, <8 x i16> %b)
3845 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
3846 // CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
3847 int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
3848 return vhsubq_s16(a, b);
3849 }
3851 // CHECK-LABEL: @test_vhsubq_s32(
3852 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3853 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3854 // CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %a, <4 x i32> %b)
3855 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
3856 // CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
3857 int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
3858 return vhsubq_s32(a, b);
3859 }
3861 // CHECK-LABEL: @test_vhsubq_u8(
3862 // CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b)
3863 // CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
3864 uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
3865 return vhsubq_u8(a, b);
3866 }
3868 // CHECK-LABEL: @test_vhsubq_u16(
3869 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3870 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3871 // CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %a, <8 x i16> %b)
3872 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
3873 // CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
3874 uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
3875 return vhsubq_u16(a, b);
3876 }
3878 // CHECK-LABEL: @test_vhsubq_u32(
3879 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3880 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3881 // CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %a, <4 x i32> %b)
3882 // CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
3883 // CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
3884 uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
3885 return vhsubq_u32(a, b);
3886 }
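// Usage sketch, not part of the checked tests: vhsub likewise computes
// (a - b) >> 1 per lane without intermediate overflow. Hypothetical helper:
static inline int16x4_t half_diff_s16(int16x4_t a, int16x4_t b) {
  return vhsub_s16(a, b);
}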
3888 // CHECK-LABEL: @test_vld1q_u8(
3889 // CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1)
3890 // CHECK: ret <16 x i8> [[VLD1]]
3891 uint8x16_t test_vld1q_u8(uint8_t const * a) {
3892 return vld1q_u8(a);
3893 }
3895 // CHECK-LABEL: @test_vld1q_u16(
3896 // CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2)
3897 // CHECK: ret <8 x i16> [[VLD1]]
3898 uint16x8_t test_vld1q_u16(uint16_t const * a) {
3899 return vld1q_u16(a);
3900 }
3902 // CHECK-LABEL: @test_vld1q_u32(
3903 // CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %a, i32 4)
3904 // CHECK: ret <4 x i32> [[VLD1]]
3905 uint32x4_t test_vld1q_u32(uint32_t const * a) {
3906 return vld1q_u32(a);
3907 }
3909 // CHECK-LABEL: @test_vld1q_u64(
3910 // CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr %a, i32 4)
3911 // CHECK: ret <2 x i64> [[VLD1]]
3912 uint64x2_t test_vld1q_u64(uint64_t const * a) {
3913 return vld1q_u64(a);
3914 }
3916 // CHECK-LABEL: @test_vld1q_s8(
3917 // CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1)
3918 // CHECK: ret <16 x i8> [[VLD1]]
3919 int8x16_t test_vld1q_s8(int8_t const * a) {
3920 return vld1q_s8(a);
3921 }
3923 // CHECK-LABEL: @test_vld1q_s16(
3924 // CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2)
3925 // CHECK: ret <8 x i16> [[VLD1]]
3926 int16x8_t test_vld1q_s16(int16_t const * a) {
3927 return vld1q_s16(a);
3928 }
3930 // CHECK-LABEL: @test_vld1q_s32(
3931 // CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %a, i32 4)
3932 // CHECK: ret <4 x i32> [[VLD1]]
3933 int32x4_t test_vld1q_s32(int32_t const * a) {
3934 return vld1q_s32(a);
3935 }
3937 // CHECK-LABEL: @test_vld1q_s64(
3938 // CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr %a, i32 4)
3939 // CHECK: ret <2 x i64> [[VLD1]]
3940 int64x2_t test_vld1q_s64(int64_t const * a) {
3941 return vld1q_s64(a);
3942 }
3944 // CHECK-LABEL: @test_vld1q_f16(
3945 // CHECK: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0(ptr %a, i32 2)
3946 // CHECK: ret <8 x half> [[VLD1]]
3947 float16x8_t test_vld1q_f16(float16_t const * a) {
3948 return vld1q_f16(a);
3949 }
3951 // CHECK-LABEL: @test_vld1q_f32(
3952 // CHECK: [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr %a, i32 4)
3953 // CHECK: ret <4 x float> [[VLD1]]
3954 float32x4_t test_vld1q_f32(float32_t const * a) {
3955 return vld1q_f32(a);
3956 }
3958 // CHECK-LABEL: @test_vld1q_p8(
3959 // CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1)
3960 // CHECK: ret <16 x i8> [[VLD1]]
3961 poly8x16_t test_vld1q_p8(poly8_t const * a) {
3962 return vld1q_p8(a);
3963 }
3965 // CHECK-LABEL: @test_vld1q_p16(
3966 // CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2)
3967 // CHECK: ret <8 x i16> [[VLD1]]
3968 poly16x8_t test_vld1q_p16(poly16_t const * a) {
3969 return vld1q_p16(a);
3970 }
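// Usage sketch, not part of the checked tests: vld1q_* is a full 128-bit
// load; the trailing i32 in the IR above is the alignment clang inferred from
// the element type (apparently capped at 4 even for 64-bit elements under
// this apcs-gnu target). Hypothetical helper:
static inline uint32x4_t load4_u32(const uint32_t *p) {
  return vld1q_u32(p);
}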
3972 // CHECK-LABEL: @test_vld1_u8(
3973 // CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1)
3974 // CHECK: ret <8 x i8> [[VLD1]]
3975 uint8x8_t test_vld1_u8(uint8_t const * a) {
3976 return vld1_u8(a);
3977 }
3979 // CHECK-LABEL: @test_vld1_u16(
3980 // CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2)
3981 // CHECK: ret <4 x i16> [[VLD1]]
3982 uint16x4_t test_vld1_u16(uint16_t const * a) {
3983 return vld1_u16(a);
3984 }
3986 // CHECK-LABEL: @test_vld1_u32(
3987 // CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr %a, i32 4)
3988 // CHECK: ret <2 x i32> [[VLD1]]
3989 uint32x2_t test_vld1_u32(uint32_t const * a) {
3990 return vld1_u32(a);
3991 }
3993 // CHECK-LABEL: @test_vld1_u64(
3994 // CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
3995 // CHECK: ret <1 x i64> [[VLD1]]
3996 uint64x1_t test_vld1_u64(uint64_t const * a) {
3997 return vld1_u64(a);
4000 // CHECK-LABEL: @test_vld1_s8(
4001 // CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1)
4002 // CHECK: ret <8 x i8> [[VLD1]]
4003 int8x8_t test_vld1_s8(int8_t const * a) {
4004 return vld1_s8(a);
4007 // CHECK-LABEL: @test_vld1_s16(
4008 // CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2)
4009 // CHECK: ret <4 x i16> [[VLD1]]
4010 int16x4_t test_vld1_s16(int16_t const * a) {
4011 return vld1_s16(a);
4014 // CHECK-LABEL: @test_vld1_s32(
4015 // CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr %a, i32 4)
4016 // CHECK: ret <2 x i32> [[VLD1]]
4017 int32x2_t test_vld1_s32(int32_t const * a) {
4018 return vld1_s32(a);
4021 // CHECK-LABEL: @test_vld1_s64(
4022 // CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
4023 // CHECK: ret <1 x i64> [[VLD1]]
4024 int64x1_t test_vld1_s64(int64_t const * a) {
4025 return vld1_s64(a);
4028 // CHECK-LABEL: @test_vld1_f16(
4029 // CHECK: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0(ptr %a, i32 2)
4030 // CHECK: ret <4 x half> [[VLD1]]
4031 float16x4_t test_vld1_f16(float16_t const * a) {
4032 return vld1_f16(a);
4035 // CHECK-LABEL: @test_vld1_f32(
4036 // CHECK: [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0(ptr %a, i32 4)
4037 // CHECK: ret <2 x float> [[VLD1]]
4038 float32x2_t test_vld1_f32(float32_t const * a) {
4039 return vld1_f32(a);
4042 // CHECK-LABEL: @test_vld1_p8(
4043 // CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1)
4044 // CHECK: ret <8 x i8> [[VLD1]]
4045 poly8x8_t test_vld1_p8(poly8_t const * a) {
4046 return vld1_p8(a);
4049 // CHECK-LABEL: @test_vld1_p16(
4050 // CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2)
4051 // CHECK: ret <4 x i16> [[VLD1]]
4052 poly16x4_t test_vld1_p16(poly16_t const * a) {
4053 return vld1_p16(a);
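// vld1[q]_dup: a scalar load of one element, an insertelement into lane 0,
// and a zero-mask shufflevector splat across all lanes.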
// CHECK-LABEL: @test_vld1q_dup_u8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK: ret <16 x i8> [[LANE]]
uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
  return vld1q_dup_u8(a);
}

// CHECK-LABEL: @test_vld1q_dup_u16(
// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i16> [[LANE]]
uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
  return vld1q_dup_u16(a);
}

// CHECK-LABEL: @test_vld1q_dup_u32(
// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i32> [[LANE]]
uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
  return vld1q_dup_u32(a);
}

// CHECK-LABEL: @test_vld1q_dup_u64(
// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
  return vld1q_dup_u64(a);
}

// CHECK-LABEL: @test_vld1q_dup_s8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK: ret <16 x i8> [[LANE]]
int8x16_t test_vld1q_dup_s8(int8_t const * a) {
  return vld1q_dup_s8(a);
}

// CHECK-LABEL: @test_vld1q_dup_s16(
// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i16> [[LANE]]
int16x8_t test_vld1q_dup_s16(int16_t const * a) {
  return vld1q_dup_s16(a);
}

// CHECK-LABEL: @test_vld1q_dup_s32(
// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i32> [[LANE]]
int32x4_t test_vld1q_dup_s32(int32_t const * a) {
  return vld1q_dup_s32(a);
}

// CHECK-LABEL: @test_vld1q_dup_s64(
// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
int64x2_t test_vld1q_dup_s64(int64_t const * a) {
  return vld1q_dup_s64(a);
}

// CHECK-LABEL: @test_vld1q_dup_f16(
// CHECK: [[TMP2:%.*]] = load half, ptr %a, align 2
// CHECK: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x half> [[LANE]]
float16x8_t test_vld1q_dup_f16(float16_t const * a) {
  return vld1q_dup_f16(a);
}

// CHECK-LABEL: @test_vld1q_dup_f32(
// CHECK: [[TMP2:%.*]] = load float, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x float> [[LANE]]
float32x4_t test_vld1q_dup_f32(float32_t const * a) {
  return vld1q_dup_f32(a);
}

// CHECK-LABEL: @test_vld1q_dup_p8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK: ret <16 x i8> [[LANE]]
poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
  return vld1q_dup_p8(a);
}

// CHECK-LABEL: @test_vld1q_dup_p16(
// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i16> [[LANE]]
poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
  return vld1q_dup_p16(a);
}

// CHECK-LABEL: @test_vld1_dup_u8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i8> [[LANE]]
uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
  return vld1_dup_u8(a);
}

// CHECK-LABEL: @test_vld1_dup_u16(
// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i16> [[LANE]]
uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
  return vld1_dup_u16(a);
}

// CHECK-LABEL: @test_vld1_dup_u32(
// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i32> [[LANE]]
uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
  return vld1_dup_u32(a);
}

// CHECK-LABEL: @test_vld1_dup_u64(
// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
  return vld1_dup_u64(a);
}

// CHECK-LABEL: @test_vld1_dup_s8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i8> [[LANE]]
int8x8_t test_vld1_dup_s8(int8_t const * a) {
  return vld1_dup_s8(a);
}

// CHECK-LABEL: @test_vld1_dup_s16(
// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i16> [[LANE]]
int16x4_t test_vld1_dup_s16(int16_t const * a) {
  return vld1_dup_s16(a);
}

// CHECK-LABEL: @test_vld1_dup_s32(
// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i32> [[LANE]]
int32x2_t test_vld1_dup_s32(int32_t const * a) {
  return vld1_dup_s32(a);
}

// CHECK-LABEL: @test_vld1_dup_s64(
// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
int64x1_t test_vld1_dup_s64(int64_t const * a) {
  return vld1_dup_s64(a);
}

// CHECK-LABEL: @test_vld1_dup_f16(
// CHECK: [[TMP2:%.*]] = load half, ptr %a, align 2
// CHECK: [[TMP3:%.*]] = insertelement <4 x half> poison, half [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x half> [[LANE]]
float16x4_t test_vld1_dup_f16(float16_t const * a) {
  return vld1_dup_f16(a);
}

// CHECK-LABEL: @test_vld1_dup_f32(
// CHECK: [[TMP2:%.*]] = load float, ptr %a, align 4
// CHECK: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: ret <2 x float> [[LANE]]
float32x2_t test_vld1_dup_f32(float32_t const * a) {
  return vld1_dup_f32(a);
}

// CHECK-LABEL: @test_vld1_dup_p8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK: ret <8 x i8> [[LANE]]
poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
  return vld1_dup_p8(a);
}

// CHECK-LABEL: @test_vld1_dup_p16(
// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: ret <4 x i16> [[LANE]]
poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
  return vld1_dup_p16(a);
}

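// vld1[q]_lane: the loaded scalar is inserted into the requested lane of %b;
// the 64-bit q variants instead combine a shufflevector of %b with a
// @llvm.arm.neon.vld1 of the new element.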
// CHECK-LABEL: @test_vld1q_lane_u8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK: ret <16 x i8> [[VLD1_LANE]]
uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
  return vld1q_lane_u8(a, b, 15);
}

// CHECK-LABEL: @test_vld1q_lane_u16(
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK: ret <8 x i16> [[VLD1_LANE]]
uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
  return vld1q_lane_u16(a, b, 7);
}

// CHECK-LABEL: @test_vld1q_lane_u32(
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK: ret <4 x i32> [[VLD1_LANE]]
uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
  return vld1q_lane_u32(a, b, 3);
}

// CHECK-LABEL: @test_vld1q_lane_u64(
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[VLD1Q_LANE]]
uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
  return vld1q_lane_u64(a, b, 1);
}

// CHECK-LABEL: @test_vld1q_lane_s8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK: ret <16 x i8> [[VLD1_LANE]]
int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
  return vld1q_lane_s8(a, b, 15);
}

// CHECK-LABEL: @test_vld1q_lane_s16(
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK: ret <8 x i16> [[VLD1_LANE]]
int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
  return vld1q_lane_s16(a, b, 7);
}

// CHECK-LABEL: @test_vld1q_lane_s32(
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK: ret <4 x i32> [[VLD1_LANE]]
int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
  return vld1q_lane_s32(a, b, 3);
}

// CHECK-LABEL: @test_vld1q_lane_s64(
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
// CHECK: ret <2 x i64> [[VLD1Q_LANE]]
int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
  return vld1q_lane_s64(a, b, 1);
}

// CHECK-LABEL: @test_vld1q_lane_f16(
// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
// CHECK: [[TMP4:%.*]] = load half, ptr %a, align 2
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
// CHECK: ret <8 x half> [[VLD1_LANE]]
float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
  return vld1q_lane_f16(a, b, 7);
}

// CHECK-LABEL: @test_vld1q_lane_f32(
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP4:%.*]] = load float, ptr %a, align 4
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
// CHECK: ret <4 x float> [[VLD1_LANE]]
float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
  return vld1q_lane_f32(a, b, 3);
}

// CHECK-LABEL: @test_vld1q_lane_p8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK: ret <16 x i8> [[VLD1_LANE]]
poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
  return vld1q_lane_p8(a, b, 15);
}

// CHECK-LABEL: @test_vld1q_lane_p16(
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK: ret <8 x i16> [[VLD1_LANE]]
poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
  return vld1q_lane_p16(a, b, 7);
}

// CHECK-LABEL: @test_vld1_lane_u8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK: ret <8 x i8> [[VLD1_LANE]]
uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
  return vld1_lane_u8(a, b, 7);
}

// CHECK-LABEL: @test_vld1_lane_u16(
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK: ret <4 x i16> [[VLD1_LANE]]
uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
  return vld1_lane_u16(a, b, 3);
}

// CHECK-LABEL: @test_vld1_lane_u32(
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK: ret <2 x i32> [[VLD1_LANE]]
uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
  return vld1_lane_u32(a, b, 1);
}

// CHECK-LABEL: @test_vld1_lane_u64(
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a, align 4
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK: ret <1 x i64> [[VLD1_LANE]]
uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
  return vld1_lane_u64(a, b, 0);
}

// CHECK-LABEL: @test_vld1_lane_s8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK: ret <8 x i8> [[VLD1_LANE]]
int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
  return vld1_lane_s8(a, b, 7);
}

// CHECK-LABEL: @test_vld1_lane_s16(
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK: ret <4 x i16> [[VLD1_LANE]]
int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
  return vld1_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vld1_lane_s32(
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK: ret <2 x i32> [[VLD1_LANE]]
int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
  return vld1_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vld1_lane_s64(
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = load i64, ptr %a, align 4
// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK: ret <1 x i64> [[VLD1_LANE]]
int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
  return vld1_lane_s64(a, b, 0);
}

// CHECK-LABEL: @test_vld1_lane_f16(
// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
// CHECK: [[TMP4:%.*]] = load half, ptr %a, align 2
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
// CHECK: ret <4 x half> [[VLD1_LANE]]
float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
  return vld1_lane_f16(a, b, 3);
}

// CHECK-LABEL: @test_vld1_lane_f32(
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP4:%.*]] = load float, ptr %a, align 4
// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
// CHECK: ret <2 x float> [[VLD1_LANE]]
float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
  return vld1_lane_f32(a, b, 1);
}

// CHECK-LABEL: @test_vld1_lane_p8(
// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK: ret <8 x i8> [[VLD1_LANE]]
poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
  return vld1_lane_p8(a, b, 7);
}

// CHECK-LABEL: @test_vld1_lane_p16(
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK: ret <4 x i16> [[VLD1_LANE]]
poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
  return vld1_lane_p16(a, b, 3);
}

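// vld2[q]: the two-vector result is returned through a struct alloca; only
// the alloca and the start of the intrinsic call are checked here.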
// CHECK-LABEL: @test_vld2q_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
  return vld2q_u8(a);
}

// CHECK-LABEL: @test_vld2q_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
  return vld2q_u16(a);
}

// CHECK-LABEL: @test_vld2q_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
  return vld2q_u32(a);
}

// CHECK-LABEL: @test_vld2q_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
int8x16x2_t test_vld2q_s8(int8_t const * a) {
  return vld2q_s8(a);
}

// CHECK-LABEL: @test_vld2q_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
int16x8x2_t test_vld2q_s16(int16_t const * a) {
  return vld2q_s16(a);
}

// CHECK-LABEL: @test_vld2q_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
int32x4x2_t test_vld2q_s32(int32_t const * a) {
  return vld2q_s32(a);
}

// CHECK-LABEL: @test_vld2q_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half>
float16x8x2_t test_vld2q_f16(float16_t const * a) {
  return vld2q_f16(a);
}

// CHECK-LABEL: @test_vld2q_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float>
float32x4x2_t test_vld2q_f32(float32_t const * a) {
  return vld2q_f32(a);
}

// CHECK-LABEL: @test_vld2q_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
  return vld2q_p8(a);
}

// CHECK-LABEL: @test_vld2q_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
  return vld2q_p16(a);
}

// CHECK-LABEL: @test_vld2_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
uint8x8x2_t test_vld2_u8(uint8_t const * a) {
  return vld2_u8(a);
}

// CHECK-LABEL: @test_vld2_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
uint16x4x2_t test_vld2_u16(uint16_t const * a) {
  return vld2_u16(a);
}

// CHECK-LABEL: @test_vld2_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
uint32x2x2_t test_vld2_u32(uint32_t const * a) {
  return vld2_u32(a);
}

// CHECK-LABEL: @test_vld2_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
uint64x1x2_t test_vld2_u64(uint64_t const * a) {
  return vld2_u64(a);
}

// CHECK-LABEL: @test_vld2_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
int8x8x2_t test_vld2_s8(int8_t const * a) {
  return vld2_s8(a);
}

// CHECK-LABEL: @test_vld2_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
int16x4x2_t test_vld2_s16(int16_t const * a) {
  return vld2_s16(a);
}

// CHECK-LABEL: @test_vld2_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
int32x2x2_t test_vld2_s32(int32_t const * a) {
  return vld2_s32(a);
}

// CHECK-LABEL: @test_vld2_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
int64x1x2_t test_vld2_s64(int64_t const * a) {
  return vld2_s64(a);
}

// CHECK-LABEL: @test_vld2_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half>
float16x4x2_t test_vld2_f16(float16_t const * a) {
  return vld2_f16(a);
}

// CHECK-LABEL: @test_vld2_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float>
float32x2x2_t test_vld2_f32(float32_t const * a) {
  return vld2_f32(a);
}

// CHECK-LABEL: @test_vld2_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
poly8x8x2_t test_vld2_p8(poly8_t const * a) {
  return vld2_p8(a);
}

// CHECK-LABEL: @test_vld2_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
poly16x4x2_t test_vld2_p16(poly16_t const * a) {
  return vld2_p16(a);
}

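// vld2[q]_lane: the struct argument arrives as an [N x i64] coerce value; it
// is spilled, memcpy'd into a local copy, and each element is reloaded (and
// bitcast for non-8-bit types) before the lane-load intrinsic call.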
// CHECK-LABEL: @test_vld2q_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
  return vld2q_lane_u16(a, b, 7);
}

// CHECK-LABEL: @test_vld2q_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
  return vld2q_lane_u32(a, b, 3);
}

// CHECK-LABEL: @test_vld2q_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
  return vld2q_lane_s16(a, b, 7);
}

// CHECK-LABEL: @test_vld2q_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
  return vld2q_lane_s32(a, b, 3);
}

// CHECK-LABEL: @test_vld2q_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>
float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
  return vld2q_lane_f16(a, b, 7);
}

// CHECK-LABEL: @test_vld2q_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>
float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
  return vld2q_lane_f32(a, b, 3);
}

// CHECK-LABEL: @test_vld2q_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
  return vld2q_lane_p16(a, b, 7);
}

// CHECK-LABEL: @test_vld2_lane_u8(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
  return vld2_lane_u8(a, b, 7);
}

// CHECK-LABEL: @test_vld2_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
  return vld2_lane_u16(a, b, 3);
}

// CHECK-LABEL: @test_vld2_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
  return vld2_lane_u32(a, b, 1);
}

// CHECK-LABEL: @test_vld2_lane_s8(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
  return vld2_lane_s8(a, b, 7);
}

// CHECK-LABEL: @test_vld2_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
  return vld2_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vld2_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
  return vld2_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vld2_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half>
float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
  return vld2_lane_f16(a, b, 3);
}

// CHECK-LABEL: @test_vld2_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float>
float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
  return vld2_lane_f32(a, b, 1);
}

// CHECK-LABEL: @test_vld2_lane_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
  return vld2_lane_p8(a, b, 7);
}

// CHECK-LABEL: @test_vld2_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
  return vld2_lane_p16(a, b, 3);
}

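// vld3[q]: same struct-alloca return pattern as vld2, with a three-vector
// aggregate.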
5018 // CHECK-LABEL: @test_vld3q_u8(
5019 // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
5020 // CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
5021 uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
5022 return vld3q_u8(a);
5025 // CHECK-LABEL: @test_vld3q_u16(
5026 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
5027 // CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
5028 uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
5029 return vld3q_u16(a);
5032 // CHECK-LABEL: @test_vld3q_u32(
5033 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
5034 // CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
5035 uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
5036 return vld3q_u32(a);
5039 // CHECK-LABEL: @test_vld3q_s8(
5040 // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
5041 // CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
5042 int8x16x3_t test_vld3q_s8(int8_t const * a) {
5043 return vld3q_s8(a);
5046 // CHECK-LABEL: @test_vld3q_s16(
5047 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
5048 // CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
5049 int16x8x3_t test_vld3q_s16(int16_t const * a) {
5050 return vld3q_s16(a);
5053 // CHECK-LABEL: @test_vld3q_s32(
5054 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
5055 // CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
5056 int32x4x3_t test_vld3q_s32(int32_t const * a) {
5057 return vld3q_s32(a);
5060 // CHECK-LABEL: @test_vld3q_f16(
5061 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
5062 // CHECK: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
5063 float16x8x3_t test_vld3q_f16(float16_t const * a) {
5064 return vld3q_f16(a);
5067 // CHECK-LABEL: @test_vld3q_f32(
5068 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
5069 // CHECK: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
5070 float32x4x3_t test_vld3q_f32(float32_t const * a) {
5071 return vld3q_f32(a);
5074 // CHECK-LABEL: @test_vld3q_p8(
5075 // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
5076 // CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
5077 poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
5078 return vld3q_p8(a);
5081 // CHECK-LABEL: @test_vld3q_p16(
5082 // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
5083 // CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
5084 poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
5085 return vld3q_p16(a);
// CHECK-LABEL: @test_vld3_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
uint8x8x3_t test_vld3_u8(uint8_t const * a) {
  return vld3_u8(a);
}

// CHECK-LABEL: @test_vld3_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
uint16x4x3_t test_vld3_u16(uint16_t const * a) {
  return vld3_u16(a);
}

// CHECK-LABEL: @test_vld3_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
uint32x2x3_t test_vld3_u32(uint32_t const * a) {
  return vld3_u32(a);
}

// CHECK-LABEL: @test_vld3_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
uint64x1x3_t test_vld3_u64(uint64_t const * a) {
  return vld3_u64(a);
}

// CHECK-LABEL: @test_vld3_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
int8x8x3_t test_vld3_s8(int8_t const * a) {
  return vld3_s8(a);
}

// CHECK-LABEL: @test_vld3_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
int16x4x3_t test_vld3_s16(int16_t const * a) {
  return vld3_s16(a);
}

// CHECK-LABEL: @test_vld3_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
int32x2x3_t test_vld3_s32(int32_t const * a) {
  return vld3_s32(a);
}

// CHECK-LABEL: @test_vld3_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
int64x1x3_t test_vld3_s64(int64_t const * a) {
  return vld3_s64(a);
}

// CHECK-LABEL: @test_vld3_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
float16x4x3_t test_vld3_f16(float16_t const * a) {
  return vld3_f16(a);
}

// CHECK-LABEL: @test_vld3_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
float32x2x3_t test_vld3_f32(float32_t const * a) {
  return vld3_f32(a);
}

// CHECK-LABEL: @test_vld3_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
poly8x8x3_t test_vld3_p8(poly8_t const * a) {
  return vld3_p8(a);
}

// CHECK-LABEL: @test_vld3_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
poly16x4x3_t test_vld3_p16(poly16_t const * a) {
  return vld3_p16(a);
}
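
// The plain vld3 forms load into 64-bit D registers; note that the 64-bit
// element variants (vld3_u64/vld3_s64 above) exist only in this form, with
// no q or lane counterpart. A hedged sketch of three-channel float
// de-interleaving (the helper is illustrative, not part of the test):
static inline float32x2_t example_sum_xyz(const float *xyz) {
  float32x2x3_t v = vld3_f32(xyz); // x, y and z coordinates of two points
  return vadd_f32(vadd_f32(v.val[0], v.val[1]), v.val[2]);
}
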
// CHECK-LABEL: @test_vld3q_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
  return vld3q_lane_u16(a, b, 7);
}

// CHECK-LABEL: @test_vld3q_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
  return vld3q_lane_u32(a, b, 3);
}

// CHECK-LABEL: @test_vld3q_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
  return vld3q_lane_s16(a, b, 7);
}

// CHECK-LABEL: @test_vld3q_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
  return vld3q_lane_s32(a, b, 3);
}

// CHECK-LABEL: @test_vld3q_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
  return vld3q_lane_f16(a, b, 7);
}

// CHECK-LABEL: @test_vld3q_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
  return vld3q_lane_f32(a, b, 3);
}

// CHECK-LABEL: @test_vld3q_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
  return vld3q_lane_p16(a, b, 7);
}
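
// vld3q_lane loads exactly three elements from memory into lane n of the
// three supplied vectors, leaving all other lanes unchanged; the lane index
// must be a compile-time constant in range (0-7 for 16-bit elements, 0-3
// for 32-bit), and no 8-bit q-form lane variant exists. A sketch under
// those assumptions (helper name hypothetical):
static inline uint16x8x3_t example_patch_lane0(const uint16_t *p,
                                               uint16x8x3_t acc) {
  return vld3q_lane_u16(p, acc, 0); // overwrite lane 0 of all three vectors
}
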
// CHECK-LABEL: @test_vld3_lane_u8(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
  return vld3_lane_u8(a, b, 7);
}

// CHECK-LABEL: @test_vld3_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
  return vld3_lane_u16(a, b, 3);
}

// CHECK-LABEL: @test_vld3_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
  return vld3_lane_u32(a, b, 1);
}

// CHECK-LABEL: @test_vld3_lane_s8(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
  return vld3_lane_s8(a, b, 7);
}

// CHECK-LABEL: @test_vld3_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
  return vld3_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vld3_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
  return vld3_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vld3_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
  return vld3_lane_f16(a, b, 3);
}

// CHECK-LABEL: @test_vld3_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
  return vld3_lane_f32(a, b, 1);
}

// CHECK-LABEL: @test_vld3_lane_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
  return vld3_lane_p8(a, b, 7);
}

// CHECK-LABEL: @test_vld3_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
  return vld3_lane_p16(a, b, 3);
}
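
// Unlike the q forms, the D-register lane loads also cover 8-bit elements
// (vld3_lane_u8/s8/p8 above). An illustrative sketch gathering one packed
// RGB pixel into lane 2 of three channel vectors (names hypothetical):
static inline uint8x8x3_t example_gather_pixel(const uint8_t *pixel,
                                               uint8x8x3_t channels) {
  return vld3_lane_u8(pixel, channels, 2);
}
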
// CHECK-LABEL: @test_vld4q_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
  return vld4q_u8(a);
}

// CHECK-LABEL: @test_vld4q_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
  return vld4q_u16(a);
}

// CHECK-LABEL: @test_vld4q_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
  return vld4q_u32(a);
}

// CHECK-LABEL: @test_vld4q_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
int8x16x4_t test_vld4q_s8(int8_t const * a) {
  return vld4q_s8(a);
}

// CHECK-LABEL: @test_vld4q_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
int16x8x4_t test_vld4q_s16(int16_t const * a) {
  return vld4q_s16(a);
}

// CHECK-LABEL: @test_vld4q_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
int32x4x4_t test_vld4q_s32(int32_t const * a) {
  return vld4q_s32(a);
}

// CHECK-LABEL: @test_vld4q_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
float16x8x4_t test_vld4q_f16(float16_t const * a) {
  return vld4q_f16(a);
}

// CHECK-LABEL: @test_vld4q_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
float32x4x4_t test_vld4q_f32(float32_t const * a) {
  return vld4q_f32(a);
}

// CHECK-LABEL: @test_vld4q_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
  return vld4q_p8(a);
}

// CHECK-LABEL: @test_vld4q_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
  return vld4q_p16(a);
}
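
// vld4q extends the same de-interleaving scheme to four-way packed data
// such as RGBA, returning a x4 struct of Q registers. A minimal sketch
// (illustrative only; the helper is not part of the test):
static inline uint8x16_t example_alpha_channel(const uint8_t *rgba) {
  uint8x16x4_t pix = vld4q_u8(rgba); // 64 bytes -> 4 channel vectors
  return pix.val[3];                 // the A component of 16 pixels
}
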
// CHECK-LABEL: @test_vld4_u8(
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
uint8x8x4_t test_vld4_u8(uint8_t const * a) {
  return vld4_u8(a);
}

// CHECK-LABEL: @test_vld4_u16(
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
uint16x4x4_t test_vld4_u16(uint16_t const * a) {
  return vld4_u16(a);
}

// CHECK-LABEL: @test_vld4_u32(
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
uint32x2x4_t test_vld4_u32(uint32_t const * a) {
  return vld4_u32(a);
}

// CHECK-LABEL: @test_vld4_u64(
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
uint64x1x4_t test_vld4_u64(uint64_t const * a) {
  return vld4_u64(a);
}

// CHECK-LABEL: @test_vld4_s8(
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
int8x8x4_t test_vld4_s8(int8_t const * a) {
  return vld4_s8(a);
}

// CHECK-LABEL: @test_vld4_s16(
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
int16x4x4_t test_vld4_s16(int16_t const * a) {
  return vld4_s16(a);
}

// CHECK-LABEL: @test_vld4_s32(
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
int32x2x4_t test_vld4_s32(int32_t const * a) {
  return vld4_s32(a);
}

// CHECK-LABEL: @test_vld4_s64(
// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
int64x1x4_t test_vld4_s64(int64_t const * a) {
  return vld4_s64(a);
}

// CHECK-LABEL: @test_vld4_f16(
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
float16x4x4_t test_vld4_f16(float16_t const * a) {
  return vld4_f16(a);
}

// CHECK-LABEL: @test_vld4_f32(
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
float32x2x4_t test_vld4_f32(float32_t const * a) {
  return vld4_f32(a);
}

// CHECK-LABEL: @test_vld4_p8(
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
poly8x8x4_t test_vld4_p8(poly8_t const * a) {
  return vld4_p8(a);
}

// CHECK-LABEL: @test_vld4_p16(
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
poly16x4x4_t test_vld4_p16(poly16_t const * a) {
  return vld4_p16(a);
}
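
// As with vld3, the plain vld4 forms target D registers and include 64-bit
// element variants with no lane counterpart. A hedged sketch summing four
// interleaved two-float streams (helper name hypothetical):
static inline float32x2_t example_sum_quad(const float *p) {
  float32x2x4_t v = vld4_f32(p);
  return vadd_f32(vadd_f32(v.val[0], v.val[1]),
                  vadd_f32(v.val[2], v.val[3]));
}
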
5767 // CHECK-LABEL: @test_vld4q_lane_u16(
5768 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
5769 // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
5770 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
5771 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
5772 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5773 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
5774 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
5775 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
5776 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
5777 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5778 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
5779 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
5780 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
5781 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5782 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
5783 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
5784 // CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
5785 // CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5786 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
5787 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
5788 // CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
5789 // CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
5790 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5791 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5792 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5793 // CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
5794 // CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
5795 uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
5796 return vld4q_lane_u16(a, b, 7);
5799 // CHECK-LABEL: @test_vld4q_lane_u32(
5800 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
5801 // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
5802 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
5803 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
5804 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5805 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
5806 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
5807 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
5808 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
5809 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5810 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
5811 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
5812 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
5813 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5814 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
5815 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
5816 // CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
5817 // CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
5818 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
5819 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
5820 // CHECK: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
5821 // CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
5822 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5823 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5824 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
5825 // CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
5826 // CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
5827 uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
5828 return vld4q_lane_u32(a, b, 3);
5831 // CHECK-LABEL: @test_vld4q_lane_s16(
5832 // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
5833 // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
5834 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
5835 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
5836 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5837 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
5838 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5839 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
5840 // CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
5841 // CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5842 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5843 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
5844 // CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
5845 // CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5846 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5847 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
5848 // CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
5849 // CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5850 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
5851 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
5852 // CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
5853 // CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
5854 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5855 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5856 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5857 // CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
5858 // CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
5859 int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
5860 return vld4q_lane_s16(a, b, 7);
5863 // CHECK-LABEL: @test_vld4q_lane_s32(
5864 // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
5865 // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
5866 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
5867 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
5868 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5869 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
5870 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5871 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
5872 // CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
5873 // CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5874 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5875 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
5876 // CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
5877 // CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5878 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5879 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
5880 // CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
5881 // CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
5882 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
5883 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
5884 // CHECK: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
5885 // CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
5886 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5887 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5888 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
5889 // CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
5890 // CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
5891 int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
5892 return vld4q_lane_s32(a, b, 3);
5895 // CHECK-LABEL: @test_vld4q_lane_f16(
5896 // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
5897 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
5898 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
5899 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
5900 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5901 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
5902 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5903 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
5904 // CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
5905 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
5906 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5907 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
5908 // CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
5909 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
5910 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5911 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
5912 // CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
5913 // CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
5914 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
5915 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
5916 // CHECK: [[TMP11:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
5917 // CHECK: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
5918 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
5919 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
5920 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
5921 // CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x half>
5922 // CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
5923 float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
5924 return vld4q_lane_f16(a, b, 7);
5927 // CHECK-LABEL: @test_vld4q_lane_f32(
5928 // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
5929 // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
5930 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
5931 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
5932 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
5933 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
5934 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5935 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
5936 // CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
5937 // CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
5938 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5939 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
5940 // CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
5941 // CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
5942 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5943 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
5944 // CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
5945 // CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
5946 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
5947 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
5948 // CHECK: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
5949 // CHECK: [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
5950 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
5951 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
5952 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
5953 // CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
5954 // CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
5955 float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
5956 return vld4q_lane_f32(a, b, 3);
// CHECK-LABEL: @test_vld4q_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
  return vld4q_lane_p16(a, b, 7);
}

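// The vld4_lane tests below cover the 64-bit (d-register) variants: the
// 32-byte struct argument arrives coerced to [4 x i64], is spilled and copied
// with memcpy, and the four vectors are then loaded back out element by
// element before the lane load.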
// CHECK-LABEL: @test_vld4_lane_u8(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
  return vld4_lane_u8(a, b, 7);
}

// CHECK-LABEL: @test_vld4_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
  return vld4_lane_u16(a, b, 3);
}

// CHECK-LABEL: @test_vld4_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP11:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
  return vld4_lane_u32(a, b, 1);
}

// CHECK-LABEL: @test_vld4_lane_s8(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
  return vld4_lane_s8(a, b, 7);
}

// CHECK-LABEL: @test_vld4_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
  return vld4_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vld4_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP11:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
  return vld4_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vld4_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP11:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half>
// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
  return vld4_lane_f16(a, b, 3);
}

// CHECK-LABEL: @test_vld4_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP11:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float>
// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
  return vld4_lane_f32(a, b, 1);
}

// CHECK-LABEL: @test_vld4_lane_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
  return vld4_lane_p8(a, b, 7);
}

// CHECK-LABEL: @test_vld4_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
  return vld4_lane_p16(a, b, 3);
}

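// vmax/vmin lower straight to the signed/unsigned NEON intrinsics
// (@llvm.arm.neon.vmaxs/vmaxu/vmins/vminu). For non-i8 element types the
// surrounding bitcasts come from the generic builtin lowering and carry no
// semantics of their own.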
// CHECK-LABEL: @test_vmax_s8(
// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VMAX_V_I]]
int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
  return vmax_s8(a, b);
}

// CHECK-LABEL: @test_vmax_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VMAX_V2_I]]
int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
  return vmax_s16(a, b);
}

// CHECK-LABEL: @test_vmax_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VMAX_V2_I]]
int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
  return vmax_s32(a, b);
}

// CHECK-LABEL: @test_vmax_u8(
// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VMAX_V_I]]
uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
  return vmax_u8(a, b);
}

// CHECK-LABEL: @test_vmax_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VMAX_V2_I]]
uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
  return vmax_u16(a, b);
}

// CHECK-LABEL: @test_vmax_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VMAX_V2_I]]
uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
  return vmax_u32(a, b);
}

// CHECK-LABEL: @test_vmax_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
// CHECK: ret <2 x float> [[VMAX_V2_I]]
float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
  return vmax_f32(a, b);
}

// CHECK-LABEL: @test_vmaxq_s8(
// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VMAXQ_V_I]]
int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
  return vmaxq_s8(a, b);
}

// CHECK-LABEL: @test_vmaxq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VMAXQ_V2_I]]
int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
  return vmaxq_s16(a, b);
}

// CHECK-LABEL: @test_vmaxq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VMAXQ_V2_I]]
int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
  return vmaxq_s32(a, b);
}

// CHECK-LABEL: @test_vmaxq_u8(
// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VMAXQ_V_I]]
uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
  return vmaxq_u8(a, b);
}

// CHECK-LABEL: @test_vmaxq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VMAXQ_V2_I]]
uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
  return vmaxq_u16(a, b);
}

// CHECK-LABEL: @test_vmaxq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VMAXQ_V2_I]]
uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
  return vmaxq_u32(a, b);
}

// CHECK-LABEL: @test_vmaxq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %b)
// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x float> [[VMAXQ_V2_I]]
float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
  return vmaxq_f32(a, b);
}

// CHECK-LABEL: @test_vmin_s8(
// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VMIN_V_I]]
int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
  return vmin_s8(a, b);
}

// CHECK-LABEL: @test_vmin_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VMIN_V2_I]]
int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
  return vmin_s16(a, b);
}

// CHECK-LABEL: @test_vmin_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VMIN_V2_I]]
int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
  return vmin_s32(a, b);
}

// CHECK-LABEL: @test_vmin_u8(
// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VMIN_V_I]]
uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
  return vmin_u8(a, b);
}

// CHECK-LABEL: @test_vmin_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VMIN_V2_I]]
uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
  return vmin_u16(a, b);
}

// CHECK-LABEL: @test_vmin_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VMIN_V2_I]]
uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
  return vmin_u32(a, b);
}

// CHECK-LABEL: @test_vmin_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
// CHECK: ret <2 x float> [[VMIN_V2_I]]
float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
  return vmin_f32(a, b);
}

// CHECK-LABEL: @test_vminq_s8(
// CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VMINQ_V_I]]
int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
  return vminq_s8(a, b);
}

// CHECK-LABEL: @test_vminq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VMINQ_V2_I]]
int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
  return vminq_s16(a, b);
}

// CHECK-LABEL: @test_vminq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VMINQ_V2_I]]
int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
  return vminq_s32(a, b);
}

// CHECK-LABEL: @test_vminq_u8(
// CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VMINQ_V_I]]
uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
  return vminq_u8(a, b);
}

// CHECK-LABEL: @test_vminq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VMINQ_V2_I]]
uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
  return vminq_u16(a, b);
}

// CHECK-LABEL: @test_vminq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VMINQ_V2_I]]
uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
  return vminq_u32(a, b);
}

// CHECK-LABEL: @test_vminq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a, <4 x float> %b)
// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x float> [[VMINQ_V2_I]]
float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
  return vminq_f32(a, b);
}

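// vmla/vmlaq need no target intrinsic: they expand to a plain IR mul (fmul
// for float) followed by an add (fadd).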
// CHECK-LABEL: @test_vmla_s8(
// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
// CHECK: ret <8 x i8> [[ADD_I]]
int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vmla_s8(a, b, c);
}

// CHECK-LABEL: @test_vmla_s16(
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
// CHECK: ret <4 x i16> [[ADD_I]]
int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmla_s16(a, b, c);
}

// CHECK-LABEL: @test_vmla_s32(
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
// CHECK: ret <2 x i32> [[ADD_I]]
int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmla_s32(a, b, c);
}

// CHECK-LABEL: @test_vmla_f32(
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c
// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
// CHECK: ret <2 x float> [[ADD_I]]
float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmla_f32(a, b, c);
}

// CHECK-LABEL: @test_vmla_u8(
// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
// CHECK: ret <8 x i8> [[ADD_I]]
uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmla_u8(a, b, c);
}

// CHECK-LABEL: @test_vmla_u16(
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
// CHECK: ret <4 x i16> [[ADD_I]]
uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmla_u16(a, b, c);
}

// CHECK-LABEL: @test_vmla_u32(
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
// CHECK: ret <2 x i32> [[ADD_I]]
uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmla_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_s8(
// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
// CHECK: ret <16 x i8> [[ADD_I]]
int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vmlaq_s8(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_s16(
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vmlaq_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_s32(
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vmlaq_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_f32(
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c
// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
// CHECK: ret <4 x float> [[ADD_I]]
float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vmlaq_f32(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_u8(
// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
// CHECK: ret <16 x i8> [[ADD_I]]
uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vmlaq_u8(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_u16(
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vmlaq_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_u32(
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vmlaq_u32(a, b, c);
}

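// The widening vmlal forms multiply through @llvm.arm.neon.vmulls/vmullu and
// then accumulate with a plain IR add in the wider type.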
// CHECK-LABEL: @test_vmlal_s8(
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlal_s8(a, b, c);
}

// CHECK-LABEL: @test_vmlal_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlal_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlal_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlal_u8(
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlal_u8(a, b, c);
}

// CHECK-LABEL: @test_vmlal_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlal_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlal_u32(a, b, c);
}

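// The _lane variants splat the selected lane of the last operand with a
// shufflevector before the multiply.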
// CHECK-LABEL: @test_vmlal_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlal_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmlal_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlal_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmlal_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlal_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmlal_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlal_lane_u32(a, b, c, 1);
}

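// The _n variants (here and in vmla_n/vmlaq_n below) build the scalar splat
// with a chain of insertelement instructions into an undef vector.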
// CHECK-LABEL: @test_vmlal_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlal_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlal_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlal_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[ADD_I]]
uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlal_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmla_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmla_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmla_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmla_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmla_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmla_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmla_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmla_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmla_lane_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmla_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmlaq_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
  return vmlaq_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmlaq_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
  return vmlaq_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmlaq_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
  return vmlaq_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmlaq_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
  return vmlaq_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmlaq_lane_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
  return vmlaq_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmla_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
// CHECK: ret <4 x i16> [[ADD_I]]
int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmla_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmla_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
// CHECK: ret <2 x i32> [[ADD_I]]
int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmla_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmla_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
// CHECK: ret <4 x i16> [[ADD_I]]
uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmla_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmla_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
// CHECK: ret <2 x i32> [[ADD_I]]
uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmla_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmla_n_f32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
// CHECK: ret <2 x float> [[ADD_I]]
float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
  return vmla_n_f32(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlaq_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlaq_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK: ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlaq_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK: ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlaq_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_f32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
// CHECK: ret <4 x float> [[ADD_I]]
float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
  return vmlaq_n_f32(a, b, c);
}

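// vmls/vmlsq: non-widening multiply-subtract, lane-wise r[i] = a[i] - b[i] * c[i].
// Codegen is a plain mul/sub (fmul/fsub for float); no target intrinsic is involved.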
// CHECK-LABEL: @test_vmls_s8(
// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
// CHECK: ret <8 x i8> [[SUB_I]]
int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vmls_s8(a, b, c);
}

// CHECK-LABEL: @test_vmls_s16(
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK: ret <4 x i16> [[SUB_I]]
int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_s16(a, b, c);
}

// CHECK-LABEL: @test_vmls_s32(
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK: ret <2 x i32> [[SUB_I]]
int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_s32(a, b, c);
}

// CHECK-LABEL: @test_vmls_f32(
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c
// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
// CHECK: ret <2 x float> [[SUB_I]]
float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_f32(a, b, c);
}

// CHECK-LABEL: @test_vmls_u8(
// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
// CHECK: ret <8 x i8> [[SUB_I]]
uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmls_u8(a, b, c);
}

// CHECK-LABEL: @test_vmls_u16(
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK: ret <4 x i16> [[SUB_I]]
uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_u16(a, b, c);
}

// CHECK-LABEL: @test_vmls_u32(
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK: ret <2 x i32> [[SUB_I]]
uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_s8(
// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
// CHECK: ret <16 x i8> [[SUB_I]]
int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vmlsq_s8(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_s16(
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vmlsq_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_s32(
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vmlsq_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_f32(
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c
// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
// CHECK: ret <4 x float> [[SUB_I]]
float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vmlsq_f32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_u8(
// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
// CHECK: ret <16 x i8> [[SUB_I]]
uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vmlsq_u8(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_u16(
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vmlsq_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_u32(
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vmlsq_u32(a, b, c);
}

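// vmlsl: widening multiply-subtract, r[i] = a[i] - widen(b[i]) * widen(c[i]).
// The widening multiply lowers to @llvm.arm.neon.vmulls (signed) or
// @llvm.arm.neon.vmullu (unsigned), followed by an ordinary sub.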
// CHECK-LABEL: @test_vmlsl_s8(
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlsl_s8(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_u8(
// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlsl_u8(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_u32(a, b, c);
}

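// vmlsl_lane: as vmlsl, but the second operand is a single lane of c,
// splatted with a shufflevector before the widening multiply.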
// CHECK-LABEL: @test_vmlsl_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmlsl_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmlsl_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmlsl_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_lane_u32(a, b, c, 1);
}

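// vmlsl_n: as vmlsl, but the second operand is a scalar, splatted with an
// insertelement chain before the widening multiply.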
// CHECK-LABEL: @test_vmlsl_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[SUB_I]]
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlsl_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[SUB_I]]
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlsl_n_u32(a, b, c);
}

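// vmls_lane/vmlsq_lane: non-widening multiply-subtract against one lane of c;
// the lane is splatted with a shufflevector and the arithmetic stays a plain
// mul/sub (fmul/fsub for float).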
// CHECK-LABEL: @test_vmls_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmls_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmls_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmls_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmls_lane_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmlsq_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
  return vmlsq_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmlsq_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
  return vmlsq_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmlsq_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
  return vmlsq_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: @test_vmlsq_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
  return vmlsq_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: @test_vmlsq_lane_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
  return vmlsq_lane_f32(a, b, c, 1);
}

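// vmls_n/vmlsq_n: non-widening multiply-subtract against a scalar, splatted
// with an insertelement chain.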
// CHECK-LABEL: @test_vmls_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK: ret <4 x i16> [[SUB_I]]
int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmls_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK: ret <2 x i32> [[SUB_I]]
int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmls_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK: ret <4 x i16> [[SUB_I]]
uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmls_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK: ret <2 x i32> [[SUB_I]]
uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmls_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_f32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
// CHECK: ret <2 x float> [[SUB_I]]
float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
  return vmls_n_f32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlsq_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlsq_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK: ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlsq_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlsq_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_f32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
// CHECK: ret <4 x float> [[SUB_I]]
float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
  return vmlsq_n_f32(a, b, c);
}

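// vmovl: lengthen each lane to twice its width; this lowers to a bare sext
// (signed) or zext (unsigned) with no intrinsic call.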
// CHECK-LABEL: @test_vmovl_s8(
// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[VMOVL_I]]
int16x8_t test_vmovl_s8(int8x8_t a) {
  return vmovl_s8(a);
}

// CHECK-LABEL: @test_vmovl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32>
// CHECK: ret <4 x i32> [[VMOVL_I]]
int32x4_t test_vmovl_s16(int16x4_t a) {
  return vmovl_s16(a);
}

// CHECK-LABEL: @test_vmovl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64>
// CHECK: ret <2 x i64> [[VMOVL_I]]
int64x2_t test_vmovl_s32(int32x2_t a) {
  return vmovl_s32(a);
}

// CHECK-LABEL: @test_vmovl_u8(
// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[VMOVL_I]]
uint16x8_t test_vmovl_u8(uint8x8_t a) {
  return vmovl_u8(a);
}

// CHECK-LABEL: @test_vmovl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32>
// CHECK: ret <4 x i32> [[VMOVL_I]]
uint32x4_t test_vmovl_u16(uint16x4_t a) {
  return vmovl_u16(a);
}

// CHECK-LABEL: @test_vmovl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64>
// CHECK: ret <2 x i64> [[VMOVL_I]]
uint64x2_t test_vmovl_u32(uint32x2_t a) {
  return vmovl_u32(a);
}

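// vmovn: narrow each lane to half its width; this lowers to a bare trunc.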
// CHECK-LABEL: @test_vmovn_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[VMOVN_I]]
int8x8_t test_vmovn_s16(int16x8_t a) {
  return vmovn_s16(a);
}

// CHECK-LABEL: @test_vmovn_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[VMOVN_I]]
int16x4_t test_vmovn_s32(int32x4_t a) {
  return vmovn_s32(a);
}

// CHECK-LABEL: @test_vmovn_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
// CHECK: ret <2 x i32> [[VMOVN_I]]
int32x2_t test_vmovn_s64(int64x2_t a) {
  return vmovn_s64(a);
}

// CHECK-LABEL: @test_vmovn_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[VMOVN_I]]
uint8x8_t test_vmovn_u16(uint16x8_t a) {
  return vmovn_u16(a);
}

// CHECK-LABEL: @test_vmovn_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[VMOVN_I]]
uint16x4_t test_vmovn_u32(uint32x4_t a) {
  return vmovn_u32(a);
}

// CHECK-LABEL: @test_vmovn_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
// CHECK: ret <2 x i32> [[VMOVN_I]]
uint32x2_t test_vmovn_u64(uint64x2_t a) {
  return vmovn_u64(a);
}

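// vmov_n/vmovq_n: broadcast a scalar into every lane via an insertelement
// chain. The f16 variants take the scalar through a pointer, presumably to
// sidestep __fp16 argument-passing restrictions.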
// CHECK-LABEL: @test_vmov_n_u8(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK: ret <8 x i8> [[VECINIT7_I]]
uint8x8_t test_vmov_n_u8(uint8_t a) {
  return vmov_n_u8(a);
}

// CHECK-LABEL: @test_vmov_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK: ret <4 x i16> [[VECINIT3_I]]
uint16x4_t test_vmov_n_u16(uint16_t a) {
  return vmov_n_u16(a);
}

// CHECK-LABEL: @test_vmov_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK: ret <2 x i32> [[VECINIT1_I]]
uint32x2_t test_vmov_n_u32(uint32_t a) {
  return vmov_n_u32(a);
}

// CHECK-LABEL: @test_vmov_n_s8(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK: ret <8 x i8> [[VECINIT7_I]]
int8x8_t test_vmov_n_s8(int8_t a) {
  return vmov_n_s8(a);
}

// CHECK-LABEL: @test_vmov_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK: ret <4 x i16> [[VECINIT3_I]]
int16x4_t test_vmov_n_s16(int16_t a) {
  return vmov_n_s16(a);
}

// CHECK-LABEL: @test_vmov_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK: ret <2 x i32> [[VECINIT1_I]]
int32x2_t test_vmov_n_s32(int32_t a) {
  return vmov_n_s32(a);
}

// CHECK-LABEL: @test_vmov_n_p8(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK: ret <8 x i8> [[VECINIT7_I]]
poly8x8_t test_vmov_n_p8(poly8_t a) {
  return vmov_n_p8(a);
}

// CHECK-LABEL: @test_vmov_n_p16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK: ret <4 x i16> [[VECINIT3_I]]
poly16x4_t test_vmov_n_p16(poly16_t a) {
  return vmov_n_p16(a);
}

// CHECK-LABEL: @test_vmov_n_f16(
// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK: ret <4 x half> [[VECINIT3]]
float16x4_t test_vmov_n_f16(float16_t *a) {
  return vmov_n_f16(*a);
}

// CHECK-LABEL: @test_vmov_n_f32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
// CHECK: ret <2 x float> [[VECINIT1_I]]
float32x2_t test_vmov_n_f32(float32_t a) {
  return vmov_n_f32(a);
}

// CHECK-LABEL: @test_vmovq_n_u8(
// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK: ret <16 x i8> [[VECINIT15_I]]
uint8x16_t test_vmovq_n_u8(uint8_t a) {
  return vmovq_n_u8(a);
}

// CHECK-LABEL: @test_vmovq_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK: ret <8 x i16> [[VECINIT7_I]]
uint16x8_t test_vmovq_n_u16(uint16_t a) {
  return vmovq_n_u16(a);
}

// CHECK-LABEL: @test_vmovq_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK: ret <4 x i32> [[VECINIT3_I]]
uint32x4_t test_vmovq_n_u32(uint32_t a) {
  return vmovq_n_u32(a);
}

// CHECK-LABEL: @test_vmovq_n_s8(
// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK: ret <16 x i8> [[VECINIT15_I]]
int8x16_t test_vmovq_n_s8(int8_t a) {
  return vmovq_n_s8(a);
}

// CHECK-LABEL: @test_vmovq_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK: ret <8 x i16> [[VECINIT7_I]]
int16x8_t test_vmovq_n_s16(int16_t a) {
  return vmovq_n_s16(a);
}

// CHECK-LABEL: @test_vmovq_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK: ret <4 x i32> [[VECINIT3_I]]
int32x4_t test_vmovq_n_s32(int32_t a) {
  return vmovq_n_s32(a);
}

// CHECK-LABEL: @test_vmovq_n_p8(
// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK: ret <16 x i8> [[VECINIT15_I]]
poly8x16_t test_vmovq_n_p8(poly8_t a) {
  return vmovq_n_p8(a);
}

// CHECK-LABEL: @test_vmovq_n_p16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK: ret <8 x i16> [[VECINIT7_I]]
poly16x8_t test_vmovq_n_p16(poly16_t a) {
  return vmovq_n_p16(a);
}

// CHECK-LABEL: @test_vmovq_n_f16(
// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
// CHECK: ret <8 x half> [[VECINIT7]]
float16x8_t test_vmovq_n_f16(float16_t *a) {
  return vmovq_n_f16(*a);
}

// CHECK-LABEL: @test_vmovq_n_f32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
// CHECK: ret <4 x float> [[VECINIT3_I]]
float32x4_t test_vmovq_n_f32(float32_t a) {
  return vmovq_n_f32(a);
}

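// The 64-bit vmov_n tests feed the <1 x i64> splat through vadd before
// returning, presumably so the insertelement survives as a distinct value in
// the checked IR instead of folding into the return.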
// CHECK-LABEL: @test_vmov_n_s64(
// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
// CHECK: ret <1 x i64> [[ADD_I]]
int64x1_t test_vmov_n_s64(int64_t a) {
  int64x1_t tmp = vmov_n_s64(a);
  return vadd_s64(tmp, tmp);
}

// CHECK-LABEL: @test_vmov_n_u64(
// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
// CHECK: ret <1 x i64> [[ADD_I]]
uint64x1_t test_vmov_n_u64(uint64_t a) {
  uint64x1_t tmp = vmov_n_u64(a);
  return vadd_u64(tmp, tmp);
}

// CHECK-LABEL: @test_vmovq_n_s64(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK: ret <2 x i64> [[VECINIT1_I]]
int64x2_t test_vmovq_n_s64(int64_t a) {
  return vmovq_n_s64(a);
}

// CHECK-LABEL: @test_vmovq_n_u64(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK: ret <2 x i64> [[VECINIT1_I]]
uint64x2_t test_vmovq_n_u64(uint64_t a) {
  return vmovq_n_u64(a);
}

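// vmul/vmulq: plain lane-wise multiply, r[i] = a[i] * b[i]; a single mul
// (fmul for float) instruction in IR.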
// CHECK-LABEL: @test_vmul_s8(
// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[MUL_I]]
int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {
  return vmul_s8(a, b);
}

// CHECK-LABEL: @test_vmul_s16(
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[MUL_I]]
int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {
  return vmul_s16(a, b);
}

// CHECK-LABEL: @test_vmul_s32(
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[MUL_I]]
int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {
  return vmul_s32(a, b);
}

// CHECK-LABEL: @test_vmul_f32(
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, %b
// CHECK: ret <2 x float> [[MUL_I]]
float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {
  return vmul_f32(a, b);
}

// CHECK-LABEL: @test_vmul_u8(
// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[MUL_I]]
uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {
  return vmul_u8(a, b);
}

// CHECK-LABEL: @test_vmul_u16(
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[MUL_I]]
uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {
  return vmul_u16(a, b);
}

// CHECK-LABEL: @test_vmul_u32(
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[MUL_I]]
uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {
  return vmul_u32(a, b);
}

// CHECK-LABEL: @test_vmulq_s8(
// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[MUL_I]]
int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {
  return vmulq_s8(a, b);
}

// CHECK-LABEL: @test_vmulq_s16(
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[MUL_I]]
int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {
  return vmulq_s16(a, b);
}

// CHECK-LABEL: @test_vmulq_s32(
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[MUL_I]]
int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {
  return vmulq_s32(a, b);
}

// CHECK-LABEL: @test_vmulq_f32(
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, %b
// CHECK: ret <4 x float> [[MUL_I]]
float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {
  return vmulq_f32(a, b);
}

// CHECK-LABEL: @test_vmulq_u8(
// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[MUL_I]]
uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {
  return vmulq_u8(a, b);
}

// CHECK-LABEL: @test_vmulq_u16(
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[MUL_I]]
uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {
  return vmulq_u16(a, b);
}

// CHECK-LABEL: @test_vmulq_u32(
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[MUL_I]]
uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {
  return vmulq_u32(a, b);
}

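// vmull: widening multiply, r[i] = widen(a[i]) * widen(b[i]); lowers to
// @llvm.arm.neon.vmulls / @llvm.arm.neon.vmullu, and the polynomial form to
// @llvm.arm.neon.vmullp.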
// CHECK-LABEL: @test_vmull_s8(
// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i16> [[VMULL_I]]
int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
  return vmull_s8(a, b);
}

// CHECK-LABEL: @test_vmull_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
// CHECK: ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
  return vmull_s16(a, b);
}

// CHECK-LABEL: @test_vmull_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
// CHECK: ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
  return vmull_s32(a, b);
}

// CHECK-LABEL: @test_vmull_u8(
// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i16> [[VMULL_I]]
uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
  return vmull_u8(a, b);
}

// CHECK-LABEL: @test_vmull_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
// CHECK: ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_u16(a, b);
}

// CHECK-LABEL: @test_vmull_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
// CHECK: ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_u32(a, b);
}

// CHECK-LABEL: @test_vmull_p8(
// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i16> [[VMULL_I]]
poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
  return vmull_p8(a, b);
}

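// vmull_lane: widening multiply against one lane of b, splatted with a
// shufflevector before the intrinsic call.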
// CHECK-LABEL: @test_vmull_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK: ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
  return vmull_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vmull_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK: ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
  return vmull_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vmull_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK: ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_lane_u16(a, b, 3);
}

// CHECK-LABEL: @test_vmull_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK: ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_lane_u32(a, b, 1);
}

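// vmull_n: widening multiply against a scalar, splatted with an
// insertelement chain before the intrinsic call.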
8142 // CHECK-LABEL: @test_vmull_n_s16(
8143 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
8144 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
8145 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
8146 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
8147 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8148 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
8149 // CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
8150 // CHECK: ret <4 x i32> [[VMULL5_I]]
8151 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
8152 return vmull_n_s16(a, b);
8155 // CHECK-LABEL: @test_vmull_n_s32(
8156 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
8157 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
8158 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8159 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
8160 // CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
8161 // CHECK: ret <2 x i64> [[VMULL3_I]]
8162 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
8163 return vmull_n_s32(a, b);
8166 // CHECK-LABEL: @test_vmull_n_u16(
8167 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
8168 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
8169 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
8170 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
8171 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8172 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
8173 // CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
8174 // CHECK: ret <4 x i32> [[VMULL5_I]]
8175 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
8176 return vmull_n_u16(a, b);
8179 // CHECK-LABEL: @test_vmull_n_u32(
8180 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
8181 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
8182 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8183 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
8184 // CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
8185 // CHECK: ret <2 x i64> [[VMULL3_I]]
8186 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
8187 return vmull_n_u32(a, b);
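// Polynomial multiplies map to the ARM-specific vmulp intrinsic.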
// CHECK-LABEL: @test_vmul_p8(
// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VMUL_V_I]]
poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {
  return vmul_p8(a, b);
}

// CHECK-LABEL: @test_vmulq_p8(
// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VMULQ_V_I]]
poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
  return vmulq_p8(a, b);
}

// CHECK-LABEL: @test_vmul_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK: ret <4 x i16> [[MUL]]
int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
  return vmul_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vmul_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK: ret <2 x i32> [[MUL]]
int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
  return vmul_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vmul_lane_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK: ret <2 x float> [[MUL]]
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
  return vmul_lane_f32(a, b, 1);
}

// CHECK-LABEL: @test_vmul_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK: ret <4 x i16> [[MUL]]
uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
  return vmul_lane_u16(a, b, 3);
}

// CHECK-LABEL: @test_vmul_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK: ret <2 x i32> [[MUL]]
uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
  return vmul_lane_u32(a, b, 1);
}

// CHECK-LABEL: @test_vmulq_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK: ret <8 x i16> [[MUL]]
int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
  return vmulq_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vmulq_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK: ret <4 x i32> [[MUL]]
int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
  return vmulq_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vmulq_lane_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK: ret <4 x float> [[MUL]]
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
  return vmulq_lane_f32(a, b, 1);
}

// CHECK-LABEL: @test_vmulq_lane_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK: ret <8 x i16> [[MUL]]
uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
  return vmulq_lane_u16(a, b, 3);
}

// CHECK-LABEL: @test_vmulq_lane_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK: ret <4 x i32> [[MUL]]
uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
  return vmulq_lane_u32(a, b, 1);
}

// CHECK-LABEL: @test_vmul_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK: ret <4 x i16> [[MUL_I]]
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
  return vmul_n_s16(a, b);
}

// CHECK-LABEL: @test_vmul_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK: ret <2 x i32> [[MUL_I]]
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}

// CHECK-LABEL: @test_vmul_n_f32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
// CHECK: ret <2 x float> [[MUL_I]]
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}

// CHECK-LABEL: @test_vmul_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK: ret <4 x i16> [[MUL_I]]
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}

// CHECK-LABEL: @test_vmul_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK: ret <2 x i32> [[MUL_I]]
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK: ret <8 x i16> [[MUL_I]]
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}

// CHECK-LABEL: @test_vmulq_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK: ret <4 x i32> [[MUL_I]]
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_f32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
// CHECK: ret <4 x float> [[MUL_I]]
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_u16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK: ret <8 x i16> [[MUL_I]]
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}

// CHECK-LABEL: @test_vmulq_n_u32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK: ret <4 x i32> [[MUL_I]]
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}

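// Bitwise NOT (vmvn) needs no ARM intrinsic: it is an xor with an all-ones splat.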
// CHECK-LABEL: @test_vmvn_s8(
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: ret <8 x i8> [[NEG_I]]
int8x8_t test_vmvn_s8(int8x8_t a) {
  return vmvn_s8(a);
}

// CHECK-LABEL: @test_vmvn_s16(
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: ret <4 x i16> [[NEG_I]]
int16x4_t test_vmvn_s16(int16x4_t a) {
  return vmvn_s16(a);
}

// CHECK-LABEL: @test_vmvn_s32(
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
// CHECK: ret <2 x i32> [[NEG_I]]
int32x2_t test_vmvn_s32(int32x2_t a) {
  return vmvn_s32(a);
}

// CHECK-LABEL: @test_vmvn_u8(
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: ret <8 x i8> [[NEG_I]]
uint8x8_t test_vmvn_u8(uint8x8_t a) {
  return vmvn_u8(a);
}

// CHECK-LABEL: @test_vmvn_u16(
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: ret <4 x i16> [[NEG_I]]
uint16x4_t test_vmvn_u16(uint16x4_t a) {
  return vmvn_u16(a);
}

// CHECK-LABEL: @test_vmvn_u32(
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
// CHECK: ret <2 x i32> [[NEG_I]]
uint32x2_t test_vmvn_u32(uint32x2_t a) {
  return vmvn_u32(a);
}

// CHECK-LABEL: @test_vmvn_p8(
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: ret <8 x i8> [[NEG_I]]
poly8x8_t test_vmvn_p8(poly8x8_t a) {
  return vmvn_p8(a);
}

// CHECK-LABEL: @test_vmvnq_s8(
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: ret <16 x i8> [[NEG_I]]
int8x16_t test_vmvnq_s8(int8x16_t a) {
  return vmvnq_s8(a);
}

// CHECK-LABEL: @test_vmvnq_s16(
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: ret <8 x i16> [[NEG_I]]
int16x8_t test_vmvnq_s16(int16x8_t a) {
  return vmvnq_s16(a);
}

// CHECK-LABEL: @test_vmvnq_s32(
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: ret <4 x i32> [[NEG_I]]
int32x4_t test_vmvnq_s32(int32x4_t a) {
  return vmvnq_s32(a);
}

// CHECK-LABEL: @test_vmvnq_u8(
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: ret <16 x i8> [[NEG_I]]
uint8x16_t test_vmvnq_u8(uint8x16_t a) {
  return vmvnq_u8(a);
}

// CHECK-LABEL: @test_vmvnq_u16(
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: ret <8 x i16> [[NEG_I]]
uint16x8_t test_vmvnq_u16(uint16x8_t a) {
  return vmvnq_u16(a);
}

// CHECK-LABEL: @test_vmvnq_u32(
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: ret <4 x i32> [[NEG_I]]
uint32x4_t test_vmvnq_u32(uint32x4_t a) {
  return vmvnq_u32(a);
}

// CHECK-LABEL: @test_vmvnq_p8(
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: ret <16 x i8> [[NEG_I]]
poly8x16_t test_vmvnq_p8(poly8x16_t a) {
  return vmvnq_p8(a);
}

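// Negation is a subtract-from-zero for integers and an fneg for floats.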
// CHECK-LABEL: @test_vneg_s8(
// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
// CHECK: ret <8 x i8> [[SUB_I]]
int8x8_t test_vneg_s8(int8x8_t a) {
  return vneg_s8(a);
}

// CHECK-LABEL: @test_vneg_s16(
// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
// CHECK: ret <4 x i16> [[SUB_I]]
int16x4_t test_vneg_s16(int16x4_t a) {
  return vneg_s16(a);
}

// CHECK-LABEL: @test_vneg_s32(
// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
// CHECK: ret <2 x i32> [[SUB_I]]
int32x2_t test_vneg_s32(int32x2_t a) {
  return vneg_s32(a);
}

// CHECK-LABEL: @test_vneg_f32(
// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %a
// CHECK: ret <2 x float> [[SUB_I]]
float32x2_t test_vneg_f32(float32x2_t a) {
  return vneg_f32(a);
}

// CHECK-LABEL: @test_vnegq_s8(
// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
// CHECK: ret <16 x i8> [[SUB_I]]
int8x16_t test_vnegq_s8(int8x16_t a) {
  return vnegq_s8(a);
}

// CHECK-LABEL: @test_vnegq_s16(
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
// CHECK: ret <8 x i16> [[SUB_I]]
int16x8_t test_vnegq_s16(int16x8_t a) {
  return vnegq_s16(a);
}

// CHECK-LABEL: @test_vnegq_s32(
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
// CHECK: ret <4 x i32> [[SUB_I]]
int32x4_t test_vnegq_s32(int32x4_t a) {
  return vnegq_s32(a);
}

// CHECK-LABEL: @test_vnegq_f32(
// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %a
// CHECK: ret <4 x float> [[SUB_I]]
float32x4_t test_vnegq_f32(float32x4_t a) {
  return vnegq_f32(a);
}

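// vorn computes a | ~b; the NOT appears as an xor with all-ones feeding the or.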
// CHECK-LABEL: @test_vorn_s8(
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK: ret <8 x i8> [[OR_I]]
int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
  return vorn_s8(a, b);
}

// CHECK-LABEL: @test_vorn_s16(
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
// CHECK: ret <4 x i16> [[OR_I]]
int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
  return vorn_s16(a, b);
}

// CHECK-LABEL: @test_vorn_s32(
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
// CHECK: ret <2 x i32> [[OR_I]]
int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
  return vorn_s32(a, b);
}

// CHECK-LABEL: @test_vorn_s64(
// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
// CHECK: ret <1 x i64> [[OR_I]]
int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
  return vorn_s64(a, b);
}

// CHECK-LABEL: @test_vorn_u8(
// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK: ret <8 x i8> [[OR_I]]
uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
  return vorn_u8(a, b);
}

// CHECK-LABEL: @test_vorn_u16(
// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
// CHECK: ret <4 x i16> [[OR_I]]
uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
  return vorn_u16(a, b);
}

// CHECK-LABEL: @test_vorn_u32(
// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
// CHECK: ret <2 x i32> [[OR_I]]
uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
  return vorn_u32(a, b);
}

// CHECK-LABEL: @test_vorn_u64(
// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
// CHECK: ret <1 x i64> [[OR_I]]
uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
  return vorn_u64(a, b);
}

// CHECK-LABEL: @test_vornq_s8(
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK: ret <16 x i8> [[OR_I]]
int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
  return vornq_s8(a, b);
}

// CHECK-LABEL: @test_vornq_s16(
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK: ret <8 x i16> [[OR_I]]
int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
  return vornq_s16(a, b);
}

// CHECK-LABEL: @test_vornq_s32(
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK: ret <4 x i32> [[OR_I]]
int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
  return vornq_s32(a, b);
}

// CHECK-LABEL: @test_vornq_s64(
// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK: ret <2 x i64> [[OR_I]]
int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
  return vornq_s64(a, b);
}

// CHECK-LABEL: @test_vornq_u8(
// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK: ret <16 x i8> [[OR_I]]
uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
  return vornq_u8(a, b);
}

// CHECK-LABEL: @test_vornq_u16(
// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK: ret <8 x i16> [[OR_I]]
uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
  return vornq_u16(a, b);
}

// CHECK-LABEL: @test_vornq_u32(
// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK: ret <4 x i32> [[OR_I]]
uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
  return vornq_u32(a, b);
}

// CHECK-LABEL: @test_vornq_u64(
// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK: ret <2 x i64> [[OR_I]]
uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
  return vornq_u64(a, b);
}

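// vorr is a plain vector or.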
// CHECK-LABEL: @test_vorr_s8(
// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[OR_I]]
int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
  return vorr_s8(a, b);
}

// CHECK-LABEL: @test_vorr_s16(
// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[OR_I]]
int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
  return vorr_s16(a, b);
}

// CHECK-LABEL: @test_vorr_s32(
// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[OR_I]]
int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
  return vorr_s32(a, b);
}

// CHECK-LABEL: @test_vorr_s64(
// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[OR_I]]
int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
  return vorr_s64(a, b);
}

// CHECK-LABEL: @test_vorr_u8(
// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK: ret <8 x i8> [[OR_I]]
uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
  return vorr_u8(a, b);
}

// CHECK-LABEL: @test_vorr_u16(
// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK: ret <4 x i16> [[OR_I]]
uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
  return vorr_u16(a, b);
}

// CHECK-LABEL: @test_vorr_u32(
// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK: ret <2 x i32> [[OR_I]]
uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
  return vorr_u32(a, b);
}

// CHECK-LABEL: @test_vorr_u64(
// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK: ret <1 x i64> [[OR_I]]
uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
  return vorr_u64(a, b);
}

// CHECK-LABEL: @test_vorrq_s8(
// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[OR_I]]
int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
  return vorrq_s8(a, b);
}

// CHECK-LABEL: @test_vorrq_s16(
// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[OR_I]]
int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
  return vorrq_s16(a, b);
}

// CHECK-LABEL: @test_vorrq_s32(
// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[OR_I]]
int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

// CHECK-LABEL: @test_vorrq_s64(
// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[OR_I]]
int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
  return vorrq_s64(a, b);
}

// CHECK-LABEL: @test_vorrq_u8(
// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK: ret <16 x i8> [[OR_I]]
uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
  return vorrq_u8(a, b);
}

// CHECK-LABEL: @test_vorrq_u16(
// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK: ret <8 x i16> [[OR_I]]
uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
  return vorrq_u16(a, b);
}

// CHECK-LABEL: @test_vorrq_u32(
// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK: ret <4 x i32> [[OR_I]]
uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
  return vorrq_u32(a, b);
}

// CHECK-LABEL: @test_vorrq_u64(
// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK: ret <2 x i64> [[OR_I]]
uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
  return vorrq_u64(a, b);
}

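// Pairwise add-and-accumulate keeps the ARM-specific vpadal intrinsics.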
// CHECK-LABEL: @test_vpadal_s8(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b)
// CHECK: ret <4 x i16> [[VPADAL_V1_I]]
int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
  return vpadal_s8(a, b);
}

// CHECK-LABEL: @test_vpadal_s16(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b)
// CHECK: ret <2 x i32> [[VPADAL_V2_I]]
int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
  return vpadal_s16(a, b);
}

// CHECK-LABEL: @test_vpadal_s32(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b)
// CHECK: ret <1 x i64> [[VPADAL_V2_I]]
int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
  return vpadal_s32(a, b);
}

// CHECK-LABEL: @test_vpadal_u8(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b)
// CHECK: ret <4 x i16> [[VPADAL_V1_I]]
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
  return vpadal_u8(a, b);
}

// CHECK-LABEL: @test_vpadal_u16(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b)
// CHECK: ret <2 x i32> [[VPADAL_V2_I]]
uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
  return vpadal_u16(a, b);
}

// CHECK-LABEL: @test_vpadal_u32(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b)
// CHECK: ret <1 x i64> [[VPADAL_V2_I]]
uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
  return vpadal_u32(a, b);
}

// CHECK-LABEL: @test_vpadalq_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b)
// CHECK: ret <8 x i16> [[VPADALQ_V1_I]]
int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
  return vpadalq_s8(a, b);
}

// CHECK-LABEL: @test_vpadalq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b)
// CHECK: ret <4 x i32> [[VPADALQ_V2_I]]
int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
  return vpadalq_s16(a, b);
}

// CHECK-LABEL: @test_vpadalq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b)
// CHECK: ret <2 x i64> [[VPADALQ_V2_I]]
int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
  return vpadalq_s32(a, b);
}

// CHECK-LABEL: @test_vpadalq_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b)
// CHECK: ret <8 x i16> [[VPADALQ_V1_I]]
uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
  return vpadalq_u8(a, b);
}

// CHECK-LABEL: @test_vpadalq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b)
// CHECK: ret <4 x i32> [[VPADALQ_V2_I]]
uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
  return vpadalq_u16(a, b);
}

// CHECK-LABEL: @test_vpadalq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b)
// CHECK: ret <2 x i64> [[VPADALQ_V2_I]]
uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
  return vpadalq_u32(a, b);
}

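// Pairwise add (vpadd) and pairwise long add (vpaddl) also stay ARM-specific.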
// CHECK-LABEL: @test_vpadd_s8(
// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VPADD_V_I]]
int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
  return vpadd_s8(a, b);
}

// CHECK-LABEL: @test_vpadd_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VPADD_V2_I]]
int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
  return vpadd_s16(a, b);
}

// CHECK-LABEL: @test_vpadd_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VPADD_V2_I]]
int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
  return vpadd_s32(a, b);
}

// CHECK-LABEL: @test_vpadd_u8(
// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VPADD_V_I]]
uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
  return vpadd_u8(a, b);
}

// CHECK-LABEL: @test_vpadd_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VPADD_V2_I]]
uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
  return vpadd_u16(a, b);
}

// CHECK-LABEL: @test_vpadd_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VPADD_V2_I]]
uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
  return vpadd_u32(a, b);
}

// CHECK-LABEL: @test_vpadd_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
// CHECK: ret <2 x float> [[VPADD_V2_I]]
float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
  return vpadd_f32(a, b);
}

// CHECK-LABEL: @test_vpaddl_s8(
// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a)
// CHECK: ret <4 x i16> [[VPADDL_I]]
int16x4_t test_vpaddl_s8(int8x8_t a) {
  return vpaddl_s8(a);
}

// CHECK-LABEL: @test_vpaddl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a)
// CHECK: ret <2 x i32> [[VPADDL1_I]]
int32x2_t test_vpaddl_s16(int16x4_t a) {
  return vpaddl_s16(a);
}

// CHECK-LABEL: @test_vpaddl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a)
// CHECK: ret <1 x i64> [[VPADDL1_I]]
int64x1_t test_vpaddl_s32(int32x2_t a) {
  return vpaddl_s32(a);
}

// CHECK-LABEL: @test_vpaddl_u8(
// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a)
// CHECK: ret <4 x i16> [[VPADDL_I]]
uint16x4_t test_vpaddl_u8(uint8x8_t a) {
  return vpaddl_u8(a);
}

// CHECK-LABEL: @test_vpaddl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a)
// CHECK: ret <2 x i32> [[VPADDL1_I]]
uint32x2_t test_vpaddl_u16(uint16x4_t a) {
  return vpaddl_u16(a);
}

// CHECK-LABEL: @test_vpaddl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a)
// CHECK: ret <1 x i64> [[VPADDL1_I]]
uint64x1_t test_vpaddl_u32(uint32x2_t a) {
  return vpaddl_u32(a);
}

// CHECK-LABEL: @test_vpaddlq_s8(
// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a)
// CHECK: ret <8 x i16> [[VPADDL_I]]
int16x8_t test_vpaddlq_s8(int8x16_t a) {
  return vpaddlq_s8(a);
}

// CHECK-LABEL: @test_vpaddlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a)
// CHECK: ret <4 x i32> [[VPADDL1_I]]
int32x4_t test_vpaddlq_s16(int16x8_t a) {
  return vpaddlq_s16(a);
}

// CHECK-LABEL: @test_vpaddlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a)
// CHECK: ret <2 x i64> [[VPADDL1_I]]
int64x2_t test_vpaddlq_s32(int32x4_t a) {
  return vpaddlq_s32(a);
}

// CHECK-LABEL: @test_vpaddlq_u8(
// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a)
// CHECK: ret <8 x i16> [[VPADDL_I]]
uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
  return vpaddlq_u8(a);
}

// CHECK-LABEL: @test_vpaddlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a)
// CHECK: ret <4 x i32> [[VPADDL1_I]]
uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
  return vpaddlq_u16(a);
}

// CHECK-LABEL: @test_vpaddlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a)
// CHECK: ret <2 x i64> [[VPADDL1_I]]
uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
  return vpaddlq_u32(a);
}

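// Pairwise max/min use vpmaxs/vpmins for signed and float types, vpmaxu/vpminu for unsigned.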
// CHECK-LABEL: @test_vpmax_s8(
// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VPMAX_V_I]]
int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
  return vpmax_s8(a, b);
}

// CHECK-LABEL: @test_vpmax_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VPMAX_V2_I]]
int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
  return vpmax_s16(a, b);
}

// CHECK-LABEL: @test_vpmax_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VPMAX_V2_I]]
int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
  return vpmax_s32(a, b);
}

// CHECK-LABEL: @test_vpmax_u8(
// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VPMAX_V_I]]
uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
  return vpmax_u8(a, b);
}

// CHECK-LABEL: @test_vpmax_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VPMAX_V2_I]]
uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
  return vpmax_u16(a, b);
}

// CHECK-LABEL: @test_vpmax_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VPMAX_V2_I]]
uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
  return vpmax_u32(a, b);
}

// CHECK-LABEL: @test_vpmax_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
// CHECK: ret <2 x float> [[VPMAX_V2_I]]
float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
  return vpmax_f32(a, b);
}

// CHECK-LABEL: @test_vpmin_s8(
// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VPMIN_V_I]]
int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
  return vpmin_s8(a, b);
}

// CHECK-LABEL: @test_vpmin_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VPMIN_V2_I]]
int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
  return vpmin_s16(a, b);
}

// CHECK-LABEL: @test_vpmin_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VPMIN_V2_I]]
int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
  return vpmin_s32(a, b);
}

// CHECK-LABEL: @test_vpmin_u8(
// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VPMIN_V_I]]
uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
  return vpmin_u8(a, b);
}

// CHECK-LABEL: @test_vpmin_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VPMIN_V2_I]]
uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
  return vpmin_u16(a, b);
}

// CHECK-LABEL: @test_vpmin_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VPMIN_V2_I]]
uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
  return vpmin_u32(a, b);
}

// CHECK-LABEL: @test_vpmin_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
// CHECK: ret <2 x float> [[VPMIN_V2_I]]
float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
  return vpmin_f32(a, b);
}

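// Saturating absolute value keeps the ARM vqabs intrinsic.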
// CHECK-LABEL: @test_vqabs_s8(
// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a)
// CHECK: ret <8 x i8> [[VQABS_V_I]]
int8x8_t test_vqabs_s8(int8x8_t a) {
  return vqabs_s8(a);
}

// CHECK-LABEL: @test_vqabs_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a)
// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQABS_V1_I]]
int16x4_t test_vqabs_s16(int16x4_t a) {
  return vqabs_s16(a);
}

// CHECK-LABEL: @test_vqabs_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a)
// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQABS_V1_I]]
int32x2_t test_vqabs_s32(int32x2_t a) {
  return vqabs_s32(a);
}

// CHECK-LABEL: @test_vqabsq_s8(
// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a)
// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
int8x16_t test_vqabsq_s8(int8x16_t a) {
  return vqabsq_s8(a);
}

// CHECK-LABEL: @test_vqabsq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a)
// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQABSQ_V1_I]]
int16x8_t test_vqabsq_s16(int16x8_t a) {
  return vqabsq_s16(a);
}

// CHECK-LABEL: @test_vqabsq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a)
// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQABSQ_V1_I]]
int32x4_t test_vqabsq_s32(int32x4_t a) {
  return vqabsq_s32(a);
}

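// Saturating adds lower to the generic @llvm.sadd.sat/@llvm.uadd.sat intrinsics rather than ARM-specific ones.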
// CHECK-LABEL: @test_vqadd_s8(
// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VQADD_V_I]]
int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
  return vqadd_s8(a, b);
}

// CHECK-LABEL: @test_vqadd_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQADD_V2_I]]
int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
  return vqadd_s16(a, b);
}

// CHECK-LABEL: @test_vqadd_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQADD_V2_I]]
int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
  return vqadd_s32(a, b);
}

// CHECK-LABEL: @test_vqadd_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VQADD_V2_I]]
int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
  return vqadd_s64(a, b);
}

// CHECK-LABEL: @test_vqadd_u8(
// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VQADD_V_I]]
uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
  return vqadd_u8(a, b);
}

// CHECK-LABEL: @test_vqadd_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQADD_V2_I]]
uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
  return vqadd_u16(a, b);
}

// CHECK-LABEL: @test_vqadd_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQADD_V2_I]]
uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
  return vqadd_u32(a, b);
}

// CHECK-LABEL: @test_vqadd_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VQADD_V2_I]]
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
  return vqadd_u64(a, b);
}

// CHECK-LABEL: @test_vqaddq_s8(
// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
  return vqaddq_s8(a, b);
}

// CHECK-LABEL: @test_vqaddq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
  return vqaddq_s16(a, b);
}

// CHECK-LABEL: @test_vqaddq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
  return vqaddq_s32(a, b);
}

// CHECK-LABEL: @test_vqaddq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
  return vqaddq_s64(a, b);
}

// CHECK-LABEL: @test_vqaddq_u8(
// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vqaddq_u8(a, b);
}

// CHECK-LABEL: @test_vqaddq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vqaddq_u16(a, b);
}

// CHECK-LABEL: @test_vqaddq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vqaddq_u32(a, b);
}

// CHECK-LABEL: @test_vqaddq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vqaddq_u64(a, b);
}

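// vqdmlal/vqdmlsl expand to a vqdmull followed by a saturating add/subtract with the accumulator.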
9396 // CHECK-LABEL: @test_vqdmlal_s16(
9397 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9398 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9399 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
9400 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
9401 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
9402 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
9403 int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
9404 return vqdmlal_s16(a, b, c);
9407 // CHECK-LABEL: @test_vqdmlal_s32(
9408 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9409 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9410 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
9411 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
9412 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
9413 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
9414 int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
9415 return vqdmlal_s32(a, b, c);
9418 // CHECK-LABEL: @test_vqdmlal_lane_s16(
9419 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
9420 // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9421 // CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9422 // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
9423 // CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
9424 // CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
9425 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
9426 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
9427 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
9428 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
9429 return vqdmlal_lane_s16(a, b, c, 3);
9432 // CHECK-LABEL: @test_vqdmlal_lane_s32(
9433 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
9434 // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9435 // CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
9436 // CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
9437 // CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
9438 // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
9439 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
9440 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
9441 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
9442 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
9443 return vqdmlal_lane_s32(a, b, c, 1);
// CHECK-LABEL: @test_vqdmlal_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlal_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlal_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlal_n_s32(a, b, c);
}

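// The _n forms splat the scalar operand through a chain of insertelement
// instructions into an undef vector before the widening multiply; the
// arithmetic is otherwise identical to vqdmlal_s16/_s32.
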
// CHECK-LABEL: @test_vqdmlsl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_s32(a, b, c);
}

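// vqdmlsl mirrors vqdmlal with subtraction: sat(a[i] - sat(2 * b[i] * c[i])),
// so the only change in the IR is @llvm.ssub.sat in place of @llvm.sadd.sat.
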
// CHECK-LABEL: @test_vqdmlsl_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: @test_vqdmlsl_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: @test_vqdmlsl_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vqdmulh_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
  return vqdmulh_s16(a, b);
}

// CHECK-LABEL: @test_vqdmulh_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
  return vqdmulh_s32(a, b);
}

// CHECK-LABEL: @test_vqdmulhq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
  return vqdmulhq_s16(a, b);
}

// CHECK-LABEL: @test_vqdmulhq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
  return vqdmulhq_s32(a, b);
}

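// vqdmulh returns the high half of the saturating doubling product,
// (2 * a[i] * b[i]) >> lane_bits; it saturates only when both inputs are
// the minimum negative lane value. A sketch of the classic fixed-point use,
// under the same hypothetical-helper caveat as above:
static inline int16x4_t q15_mul_sketch(int16x4_t x, int16x4_t y) {
  // Q15 * Q15 -> Q15: the doubling cancels the redundant sign bit of the
  // 32-bit product, so the high half is already in Q15 format.
  return vqdmulh_s16(x, y);
}
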
// CHECK-LABEL: @test_vqdmulh_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
  return vqdmulh_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vqdmulh_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
  return vqdmulh_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
  return vqdmulhq_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
  return vqdmulhq_lane_s32(a, b, 1);
}

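// For the q-register _lane forms the splat also widens: one lane of the
// 64-bit source vector is broadcast to all eight (or four) lanes of the
// 128-bit operand, as the <8 x i32> and <4 x i32> shuffle masks above show.
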
// CHECK-LABEL: @test_vqdmulh_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQDMULH_V5_I]]
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqdmulh_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmulh_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQDMULH_V3_I]]
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqdmulh_n_s32(a, b);
}

// CHECK-LABEL: @test_vqdmulhq_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQDMULHQ_V9_I]]
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqdmulhq_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmulhq_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQDMULHQ_V5_I]]
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqdmulhq_n_s32(a, b);
}

// CHECK-LABEL: @test_vqdmull_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
  return vqdmull_s16(a, b);
}

// CHECK-LABEL: @test_vqdmull_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
  return vqdmull_s32(a, b);
}

// CHECK-LABEL: @test_vqdmull_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
  return vqdmull_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vqdmull_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
  return vqdmull_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vqdmull_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQDMULL_V5_I]]
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
  return vqdmull_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmull_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQDMULL_V3_I]]
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
  return vqdmull_n_s32(a, b);
}

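// vqdmull is the widening primitive underlying the vqdmlal/vqdmlsl tests
// above: sat(2 * a[i] * b[i]) produced into double-width lanes via
// @llvm.arm.neon.vqdmull. Only the multiply saturates here; there is no
// accumulate step.
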
// CHECK-LABEL: @test_vqmovn_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a)
// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
int8x8_t test_vqmovn_s16(int16x8_t a) {
  return vqmovn_s16(a);
}

// CHECK-LABEL: @test_vqmovn_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a)
// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
int16x4_t test_vqmovn_s32(int32x4_t a) {
  return vqmovn_s32(a);
}

// CHECK-LABEL: @test_vqmovn_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a)
// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
int32x2_t test_vqmovn_s64(int64x2_t a) {
  return vqmovn_s64(a);
}

// CHECK-LABEL: @test_vqmovn_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a)
// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
uint8x8_t test_vqmovn_u16(uint16x8_t a) {
  return vqmovn_u16(a);
}

// CHECK-LABEL: @test_vqmovn_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a)
// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
uint16x4_t test_vqmovn_u32(uint32x4_t a) {
  return vqmovn_u32(a);
}

// CHECK-LABEL: @test_vqmovn_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a)
// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
uint32x2_t test_vqmovn_u64(uint64x2_t a) {
  return vqmovn_u64(a);
}

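// vqmovn narrows each lane to half width with saturation, selecting the
// signed (vqmovns) or unsigned (vqmovnu) clamp from the element type.
// A sketch, with the same hypothetical-helper caveat as above:
static inline int8x8_t narrow_q15_to_q7_sketch(int16x8_t x) {
  // Saturating narrow: lanes outside [-128, 127] clamp to the endpoints.
  return vqmovn_s16(x);
}
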
// CHECK-LABEL: @test_vqmovun_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a)
// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
uint8x8_t test_vqmovun_s16(int16x8_t a) {
  return vqmovun_s16(a);
}

// CHECK-LABEL: @test_vqmovun_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a)
// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]]
uint16x4_t test_vqmovun_s32(int32x4_t a) {
  return vqmovun_s32(a);
}

// CHECK-LABEL: @test_vqmovun_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a)
// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]]
uint32x2_t test_vqmovun_s64(int64x2_t a) {
  return vqmovun_s64(a);
}

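// vqmovun takes signed input but clamps to the unsigned range of the
// narrower type (negative lanes become 0), hence the mixed-signedness
// @llvm.arm.neon.vqmovnsu intrinsic.
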
// CHECK-LABEL: @test_vqneg_s8(
// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a)
// CHECK: ret <8 x i8> [[VQNEG_V_I]]
int8x8_t test_vqneg_s8(int8x8_t a) {
  return vqneg_s8(a);
}

// CHECK-LABEL: @test_vqneg_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a)
// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQNEG_V1_I]]
int16x4_t test_vqneg_s16(int16x4_t a) {
  return vqneg_s16(a);
}

// CHECK-LABEL: @test_vqneg_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a)
// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQNEG_V1_I]]
int32x2_t test_vqneg_s32(int32x2_t a) {
  return vqneg_s32(a);
}

// CHECK-LABEL: @test_vqnegq_s8(
// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a)
// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
int8x16_t test_vqnegq_s8(int8x16_t a) {
  return vqnegq_s8(a);
}

// CHECK-LABEL: @test_vqnegq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a)
// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]]
int16x8_t test_vqnegq_s16(int16x8_t a) {
  return vqnegq_s16(a);
}

// CHECK-LABEL: @test_vqnegq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a)
// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]]
int32x4_t test_vqnegq_s32(int32x4_t a) {
  return vqnegq_s32(a);
}

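// vqneg is a saturating negate: it differs from plain negation only for the
// most negative lane value, e.g. INT8_MIN maps to INT8_MAX instead of
// wrapping back to INT8_MIN.
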
// CHECK-LABEL: @test_vqrdmulh_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
  return vqrdmulh_s16(a, b);
}

// CHECK-LABEL: @test_vqrdmulh_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
  return vqrdmulh_s32(a, b);
}

// CHECK-LABEL: @test_vqrdmulhq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
  return vqrdmulhq_s16(a, b);
}

// CHECK-LABEL: @test_vqrdmulhq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}

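// vqrdmulh is vqdmulh with rounding: a rounding constant of
// 1 << (lane_bits - 1) is added to the doubled product before the high half
// is taken, which halves the truncation bias in long fixed-point chains.
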
// CHECK-LABEL: @test_vqrdmulh_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
  return vqrdmulh_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vqrdmulh_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
  return vqrdmulh_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
  return vqrdmulhq_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
  return vqrdmulhq_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vqrdmulh_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQRDMULH_V5_I]]
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqrdmulh_n_s16(a, b);
}

// CHECK-LABEL: @test_vqrdmulh_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQRDMULH_V3_I]]
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqrdmulh_n_s32(a, b);
}

// CHECK-LABEL: @test_vqrdmulhq_n_s16(
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQRDMULHQ_V9_I]]
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqrdmulhq_n_s16(a, b);
}

// CHECK-LABEL: @test_vqrdmulhq_n_s32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQRDMULHQ_V5_I]]
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqrdmulhq_n_s32(a, b);
}

// CHECK-LABEL: @test_vqrshl_s8(
// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
  return vqrshl_s8(a, b);
}

// CHECK-LABEL: @test_vqrshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
  return vqrshl_s16(a, b);
}

// CHECK-LABEL: @test_vqrshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
  return vqrshl_s32(a, b);
}

// CHECK-LABEL: @test_vqrshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
  return vqrshl_s64(a, b);
}

// CHECK-LABEL: @test_vqrshl_u8(
// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
  return vqrshl_u8(a, b);
}

// CHECK-LABEL: @test_vqrshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
  return vqrshl_u16(a, b);
}

// CHECK-LABEL: @test_vqrshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
  return vqrshl_u32(a, b);
}

// CHECK-LABEL: @test_vqrshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
  return vqrshl_u64(a, b);
}

// CHECK-LABEL: @test_vqrshlq_s8(
// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
  return vqrshlq_s8(a, b);
}

// CHECK-LABEL: @test_vqrshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
  return vqrshlq_s16(a, b);
}

// CHECK-LABEL: @test_vqrshlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
  return vqrshlq_s32(a, b);
}

// CHECK-LABEL: @test_vqrshlq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
  return vqrshlq_s64(a, b);
}

// CHECK-LABEL: @test_vqrshlq_u8(
// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
  return vqrshlq_u8(a, b);
}

// CHECK-LABEL: @test_vqrshlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
  return vqrshlq_u16(a, b);
}

// CHECK-LABEL: @test_vqrshlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
  return vqrshlq_u32(a, b);
}

// CHECK-LABEL: @test_vqrshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
  return vqrshlq_u64(a, b);
}

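// vqrshl shifts left by a per-lane *signed* count taken from the second
// operand (negative counts shift right), rounding and saturating the result.
// The value's signedness selects vqrshifts vs. vqrshiftu; the shift vector is
// always signed, which is why the unsigned variants still take a signed b.
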
// CHECK-LABEL: @test_vqrshrn_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
  return vqrshrn_n_s16(a, 1);
}

// CHECK-LABEL: @test_vqrshrn_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
  return vqrshrn_n_s32(a, 1);
}

// CHECK-LABEL: @test_vqrshrn_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
  return vqrshrn_n_s64(a, 1);
}

// CHECK-LABEL: @test_vqrshrn_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
  return vqrshrn_n_u16(a, 1);
}

// CHECK-LABEL: @test_vqrshrn_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
  return vqrshrn_n_u32(a, 1);
}

// CHECK-LABEL: @test_vqrshrn_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
  return vqrshrn_n_u64(a, 1);
}

// CHECK-LABEL: @test_vqrshrun_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <8 x i8> [[VQRSHRUN_N1]]
uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
  return vqrshrun_n_s16(a, 1);
}

// CHECK-LABEL: @test_vqrshrun_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: ret <4 x i16> [[VQRSHRUN_N1]]
uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
  return vqrshrun_n_s32(a, 1);
}

// CHECK-LABEL: @test_vqrshrun_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: ret <2 x i32> [[VQRSHRUN_N1]]
uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
  return vqrshrun_n_s64(a, 1);
}

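// For the _n narrowing shifts the immediate is folded into the intrinsic as
// a splatted *negative* constant (shift right by 1 becomes <i16 -1, ...>),
// reusing the signed-shift-count convention of the vqrshiftn* intrinsics.
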
// CHECK-LABEL: @test_vqshl_s8(
// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VQSHL_V_I]]
int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
  return vqshl_s8(a, b);
}

// CHECK-LABEL: @test_vqshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
  return vqshl_s16(a, b);
}

// CHECK-LABEL: @test_vqshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
  return vqshl_s32(a, b);
}

// CHECK-LABEL: @test_vqshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
  return vqshl_s64(a, b);
}

// CHECK-LABEL: @test_vqshl_u8(
// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VQSHL_V_I]]
uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
  return vqshl_u8(a, b);
}

// CHECK-LABEL: @test_vqshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
  return vqshl_u16(a, b);
}

// CHECK-LABEL: @test_vqshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
  return vqshl_u32(a, b);
}

// CHECK-LABEL: @test_vqshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
  return vqshl_u64(a, b);
}

// CHECK-LABEL: @test_vqshlq_s8(
// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
  return vqshlq_s8(a, b);
}

// CHECK-LABEL: @test_vqshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
  return vqshlq_s16(a, b);
}

// CHECK-LABEL: @test_vqshlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
  return vqshlq_s32(a, b);
}

// CHECK-LABEL: @test_vqshlq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
  return vqshlq_s64(a, b);
}

// CHECK-LABEL: @test_vqshlq_u8(
// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
  return vqshlq_u8(a, b);
}

// CHECK-LABEL: @test_vqshlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
  return vqshlq_u16(a, b);
}

// CHECK-LABEL: @test_vqshlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
  return vqshlq_u32(a, b);
}

// CHECK-LABEL: @test_vqshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
  return vqshlq_u64(a, b);
}

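// vqshl is the non-rounding counterpart of vqrshl above: the same saturating
// shift-by-signed-vector semantics, lowered to vqshifts/vqshiftu instead of
// vqrshifts/vqrshiftu.
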
// CHECK-LABEL: @test_vqshlu_n_s8(
// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <8 x i8> [[VQSHLU_N]]
uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
  return vqshlu_n_s8(a, 1);
}

// CHECK-LABEL: @test_vqshlu_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK: ret <4 x i16> [[VQSHLU_N1]]
uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
  return vqshlu_n_s16(a, 1);
}

// CHECK-LABEL: @test_vqshlu_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>)
// CHECK: ret <2 x i32> [[VQSHLU_N1]]
uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
  return vqshlu_n_s32(a, 1);
}

// CHECK-LABEL: @test_vqshlu_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
// CHECK: ret <1 x i64> [[VQSHLU_N1]]
uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
  return vqshlu_n_s64(a, 1);
}

// CHECK-LABEL: @test_vqshluq_n_s8(
// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <16 x i8> [[VQSHLU_N]]
uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
  return vqshluq_n_s8(a, 1);
}

// CHECK-LABEL: @test_vqshluq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK: ret <8 x i16> [[VQSHLU_N1]]
uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
  return vqshluq_n_s16(a, 1);
}

// CHECK-LABEL: @test_vqshluq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK: ret <4 x i32> [[VQSHLU_N1]]
uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
  return vqshluq_n_s32(a, 1);
}

// CHECK-LABEL: @test_vqshluq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>)
// CHECK: ret <2 x i64> [[VQSHLU_N1]]
uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
  return vqshluq_n_s64(a, 1);
}

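// vqshlu_n left-shifts signed input by an immediate but saturates to the
// *unsigned* range of the same width (negative lanes clamp to 0), via the
// mixed-signedness vqshiftsu intrinsic. Left-shift immediates stay positive
// in the splat, in contrast to the negated right-shift immediates above.
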
// CHECK-LABEL: @test_vqshl_n_s8(
// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <8 x i8> [[VQSHL_N]]
int8x8_t test_vqshl_n_s8(int8x8_t a) {
  return vqshl_n_s8(a, 1);
}

// CHECK-LABEL: @test_vqshl_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK: ret <4 x i16> [[VQSHL_N1]]
int16x4_t test_vqshl_n_s16(int16x4_t a) {
  return vqshl_n_s16(a, 1);
}

// CHECK-LABEL: @test_vqshl_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
// CHECK: ret <2 x i32> [[VQSHL_N1]]
int32x2_t test_vqshl_n_s32(int32x2_t a) {
  return vqshl_n_s32(a, 1);
}

// CHECK-LABEL: @test_vqshl_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
// CHECK: ret <1 x i64> [[VQSHL_N1]]
int64x1_t test_vqshl_n_s64(int64x1_t a) {
  return vqshl_n_s64(a, 1);
}

// CHECK-LABEL: @test_vqshl_n_u8(
// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <8 x i8> [[VQSHL_N]]
uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
  return vqshl_n_u8(a, 1);
}

// CHECK-LABEL: @test_vqshl_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK: ret <4 x i16> [[VQSHL_N1]]
uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
  return vqshl_n_u16(a, 1);
}

// CHECK-LABEL: @test_vqshl_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
// CHECK: ret <2 x i32> [[VQSHL_N1]]
uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
  return vqshl_n_u32(a, 1);
}

// CHECK-LABEL: @test_vqshl_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
// CHECK: ret <1 x i64> [[VQSHL_N1]]
uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
  return vqshl_n_u64(a, 1);
}

// CHECK-LABEL: @test_vqshlq_n_s8(
// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <16 x i8> [[VQSHL_N]]
int8x16_t test_vqshlq_n_s8(int8x16_t a) {
  return vqshlq_n_s8(a, 1);
}

// CHECK-LABEL: @test_vqshlq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK: ret <8 x i16> [[VQSHL_N1]]
int16x8_t test_vqshlq_n_s16(int16x8_t a) {
  return vqshlq_n_s16(a, 1);
}

// CHECK-LABEL: @test_vqshlq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK: ret <4 x i32> [[VQSHL_N1]]
int32x4_t test_vqshlq_n_s32(int32x4_t a) {
  return vqshlq_n_s32(a, 1);
}

// CHECK-LABEL: @test_vqshlq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
// CHECK: ret <2 x i64> [[VQSHL_N1]]
int64x2_t test_vqshlq_n_s64(int64x2_t a) {
  return vqshlq_n_s64(a, 1);
}

// CHECK-LABEL: @test_vqshlq_n_u8(
// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <16 x i8> [[VQSHL_N]]
uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
  return vqshlq_n_u8(a, 1);
}

// CHECK-LABEL: @test_vqshlq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK: ret <8 x i16> [[VQSHL_N1]]
uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
  return vqshlq_n_u16(a, 1);
}

// CHECK-LABEL: @test_vqshlq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK: ret <4 x i32> [[VQSHL_N1]]
uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
  return vqshlq_n_u32(a, 1);
}

// CHECK-LABEL: @test_vqshlq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
// CHECK: ret <2 x i64> [[VQSHL_N1]]
uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
  return vqshlq_n_u64(a, 1);
}

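// In the narrowing right-shift tests that follow, the immediate is carried
// as a splat of its negation (a shift by 1 becomes a splat of -1) in the
// second operand of @llvm.arm.neon.vqshiftns/vqshiftnu/vqshiftnsu.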
// CHECK-LABEL: @test_vqshrn_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <8 x i8> [[VQSHRN_N1]]
int8x8_t test_vqshrn_n_s16(int16x8_t a) {
  return vqshrn_n_s16(a, 1);
}

// CHECK-LABEL: @test_vqshrn_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: ret <4 x i16> [[VQSHRN_N1]]
int16x4_t test_vqshrn_n_s32(int32x4_t a) {
  return vqshrn_n_s32(a, 1);
}

// CHECK-LABEL: @test_vqshrn_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: ret <2 x i32> [[VQSHRN_N1]]
int32x2_t test_vqshrn_n_s64(int64x2_t a) {
  return vqshrn_n_s64(a, 1);
}

// CHECK-LABEL: @test_vqshrn_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <8 x i8> [[VQSHRN_N1]]
uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
  return vqshrn_n_u16(a, 1);
}

// CHECK-LABEL: @test_vqshrn_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: ret <4 x i16> [[VQSHRN_N1]]
uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
  return vqshrn_n_u32(a, 1);
}

// CHECK-LABEL: @test_vqshrn_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: ret <2 x i32> [[VQSHRN_N1]]
uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
  return vqshrn_n_u64(a, 1);
}

// CHECK-LABEL: @test_vqshrun_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <8 x i8> [[VQSHRUN_N1]]
uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
  return vqshrun_n_s16(a, 1);
}

// CHECK-LABEL: @test_vqshrun_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: ret <4 x i16> [[VQSHRUN_N1]]
uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
  return vqshrun_n_s32(a, 1);
}

// CHECK-LABEL: @test_vqshrun_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: ret <2 x i32> [[VQSHRUN_N1]]
uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
  return vqshrun_n_s64(a, 1);
}

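// Saturating subtraction lowers to the target-independent @llvm.ssub.sat /
// @llvm.usub.sat intrinsics rather than an ARM-specific one; the bitcasts
// matched alongside them are not consumed by the saturating call itself.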
// CHECK-LABEL: @test_vqsub_s8(
// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VQSUB_V_I]]
int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
  return vqsub_s8(a, b);
}

// CHECK-LABEL: @test_vqsub_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
  return vqsub_s16(a, b);
}

// CHECK-LABEL: @test_vqsub_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
  return vqsub_s32(a, b);
}

// CHECK-LABEL: @test_vqsub_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
  return vqsub_s64(a, b);
}

// CHECK-LABEL: @test_vqsub_u8(
// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VQSUB_V_I]]
uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
  return vqsub_u8(a, b);
}

// CHECK-LABEL: @test_vqsub_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
  return vqsub_u16(a, b);
}

// CHECK-LABEL: @test_vqsub_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
  return vqsub_u32(a, b);
}

// CHECK-LABEL: @test_vqsub_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
  return vqsub_u64(a, b);
}

// CHECK-LABEL: @test_vqsubq_s8(
// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
  return vqsubq_s8(a, b);
}

// CHECK-LABEL: @test_vqsubq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
  return vqsubq_s16(a, b);
}

// CHECK-LABEL: @test_vqsubq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
  return vqsubq_s32(a, b);
}

// CHECK-LABEL: @test_vqsubq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
  return vqsubq_s64(a, b);
}

// CHECK-LABEL: @test_vqsubq_u8(
// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vqsubq_u8(a, b);
}

// CHECK-LABEL: @test_vqsubq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vqsubq_u16(a, b);
}

// CHECK-LABEL: @test_vqsubq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vqsubq_u32(a, b);
}

// CHECK-LABEL: @test_vqsubq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
  return vqsubq_u64(a, b);
}

// CHECK-LABEL: @test_vraddhn_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
  return vraddhn_s16(a, b);
}

// CHECK-LABEL: @test_vraddhn_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
  return vraddhn_s32(a, b);
}

// CHECK-LABEL: @test_vraddhn_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
  return vraddhn_s64(a, b);
}

// CHECK-LABEL: @test_vraddhn_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
  return vraddhn_u16(a, b);
}

// CHECK-LABEL: @test_vraddhn_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
  return vraddhn_u32(a, b);
}

// CHECK-LABEL: @test_vraddhn_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
  return vraddhn_u64(a, b);
}

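// vrecpe (reciprocal estimate) keeps ARM-specific intrinsics for both the
// floating-point and u32 variants; vrecps is the floating-point refinement
// step used for Newton-Raphson iteration.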
// CHECK-LABEL: @test_vrecpe_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a)
// CHECK: ret <2 x float> [[VRECPE_V1_I]]
float32x2_t test_vrecpe_f32(float32x2_t a) {
  return vrecpe_f32(a);
}

// CHECK-LABEL: @test_vrecpe_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a)
// CHECK: ret <2 x i32> [[VRECPE_V1_I]]
uint32x2_t test_vrecpe_u32(uint32x2_t a) {
  return vrecpe_u32(a);
}

// CHECK-LABEL: @test_vrecpeq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a)
// CHECK: ret <4 x float> [[VRECPEQ_V1_I]]
float32x4_t test_vrecpeq_f32(float32x4_t a) {
  return vrecpeq_f32(a);
}

// CHECK-LABEL: @test_vrecpeq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a)
// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]]
uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
  return vrecpeq_u32(a);
}

// CHECK-LABEL: @test_vrecps_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
// CHECK: ret <2 x float> [[VRECPS_V2_I]]
float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
  return vrecps_f32(a, b);
}

// CHECK-LABEL: @test_vrecpsq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %a, <4 x float> %b)
// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x float> [[VRECPSQ_V2_I]]
float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
  return vrecpsq_f32(a, b);
}

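// The vreinterpret tests enumerate every source element type. A reinterpret
// between distinct IR vector types is a single bitcast; when source and
// destination share an IR type (e.g. s8 <-> u8), no instruction is emitted
// and %a is returned directly.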
// CHECK-LABEL: @test_vreinterpret_s8_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
  return vreinterpret_s8_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
  return vreinterpret_s8_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
  return vreinterpret_s8_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_u8(
// CHECK: ret <8 x i8> %a
int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
  return vreinterpret_s8_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
  return vreinterpret_s8_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
  return vreinterpret_s8_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
  return vreinterpret_s8_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
  return vreinterpret_s8_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
  return vreinterpret_s8_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_p8(
// CHECK: ret <8 x i8> %a
int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
  return vreinterpret_s8_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_s8_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
  return vreinterpret_s8_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
  return vreinterpret_s16_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
  return vreinterpret_s16_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
  return vreinterpret_s16_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
  return vreinterpret_s16_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_u16(
// CHECK: ret <4 x i16> %a
int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
  return vreinterpret_s16_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
  return vreinterpret_s16_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
  return vreinterpret_s16_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
  return vreinterpret_s16_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
  return vreinterpret_s16_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
  return vreinterpret_s16_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_s16_p16(
// CHECK: ret <4 x i16> %a
int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
  return vreinterpret_s16_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
  return vreinterpret_s32_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
  return vreinterpret_s32_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
  return vreinterpret_s32_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
  return vreinterpret_s32_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
  return vreinterpret_s32_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_u32(
// CHECK: ret <2 x i32> %a
int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
  return vreinterpret_s32_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
  return vreinterpret_s32_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
  return vreinterpret_s32_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
  return vreinterpret_s32_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
  return vreinterpret_s32_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_s32_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
  return vreinterpret_s32_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
  return vreinterpret_s64_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
  return vreinterpret_s64_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
  return vreinterpret_s64_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
  return vreinterpret_s64_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
  return vreinterpret_s64_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
  return vreinterpret_s64_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_u64(
// CHECK: ret <1 x i64> %a
int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
  return vreinterpret_s64_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
  return vreinterpret_s64_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
  return vreinterpret_s64_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
  return vreinterpret_s64_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_s64_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
  return vreinterpret_s64_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_s8(
// CHECK: ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
  return vreinterpret_u8_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
  return vreinterpret_u8_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
  return vreinterpret_u8_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
  return vreinterpret_u8_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
  return vreinterpret_u8_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
  return vreinterpret_u8_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
  return vreinterpret_u8_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
  return vreinterpret_u8_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
  return vreinterpret_u8_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_p8(
// CHECK: ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
  return vreinterpret_u8_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_u8_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
  return vreinterpret_u8_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
  return vreinterpret_u16_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_s16(
// CHECK: ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
  return vreinterpret_u16_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
  return vreinterpret_u16_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
  return vreinterpret_u16_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
  return vreinterpret_u16_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
  return vreinterpret_u16_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
  return vreinterpret_u16_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
  return vreinterpret_u16_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
  return vreinterpret_u16_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
  return vreinterpret_u16_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_u16_p16(
// CHECK: ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
  return vreinterpret_u16_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
  return vreinterpret_u32_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
  return vreinterpret_u32_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_s32(
// CHECK: ret <2 x i32> %a
uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
  return vreinterpret_u32_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
  return vreinterpret_u32_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
  return vreinterpret_u32_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
  return vreinterpret_u32_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
  return vreinterpret_u32_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
  return vreinterpret_u32_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
  return vreinterpret_u32_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
  return vreinterpret_u32_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_u32_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK: ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
  return vreinterpret_u32_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
  return vreinterpret_u64_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
  return vreinterpret_u64_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
  return vreinterpret_u64_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_s64(
// CHECK: ret <1 x i64> %a
uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
  return vreinterpret_u64_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
  return vreinterpret_u64_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
  return vreinterpret_u64_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
  return vreinterpret_u64_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
  return vreinterpret_u64_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
  return vreinterpret_u64_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
  return vreinterpret_u64_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_u64_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK: ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
  return vreinterpret_u64_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
  return vreinterpret_f16_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
  return vreinterpret_f16_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
  return vreinterpret_f16_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
  return vreinterpret_f16_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
  return vreinterpret_f16_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
  return vreinterpret_f16_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
  return vreinterpret_f16_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
  return vreinterpret_f16_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
  return vreinterpret_f16_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
  return vreinterpret_f16_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_f16_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK: ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
  return vreinterpret_f16_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
  return vreinterpret_f32_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
  return vreinterpret_f32_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
  return vreinterpret_f32_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
  return vreinterpret_f32_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
  return vreinterpret_f32_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
  return vreinterpret_f32_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
  return vreinterpret_f32_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
  return vreinterpret_f32_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
  return vreinterpret_f32_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
  return vreinterpret_f32_p8(a);
}

// CHECK-LABEL: @test_vreinterpret_f32_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK: ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
  return vreinterpret_f32_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_s8(
// CHECK: ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
  return vreinterpret_p8_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
  return vreinterpret_p8_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
  return vreinterpret_p8_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
  return vreinterpret_p8_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_u8(
// CHECK: ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
  return vreinterpret_p8_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
  return vreinterpret_p8_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
  return vreinterpret_p8_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
  return vreinterpret_p8_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
  return vreinterpret_p8_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
  return vreinterpret_p8_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_p8_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
  return vreinterpret_p8_p16(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
  return vreinterpret_p16_s8(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_s16(
// CHECK: ret <4 x i16> %a
poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
  return vreinterpret_p16_s16(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
  return vreinterpret_p16_s32(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
  return vreinterpret_p16_s64(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
  return vreinterpret_p16_u8(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_u16(
// CHECK: ret <4 x i16> %a
poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
  return vreinterpret_p16_u16(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
  return vreinterpret_p16_u32(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
  return vreinterpret_p16_u64(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_f16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
  return vreinterpret_p16_f16(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
  return vreinterpret_p16_f32(a);
}

// CHECK-LABEL: @test_vreinterpret_p16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK: ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
  return vreinterpret_p16_p8(a);
}

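// The vreinterpretq tests repeat the same pattern for the 128-bit q-register
// types.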
// CHECK-LABEL: @test_vreinterpretq_s8_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
  return vreinterpretq_s8_s16(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
  return vreinterpretq_s8_s32(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
  return vreinterpretq_s8_s64(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_u8(
// CHECK: ret <16 x i8> %a
int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
  return vreinterpretq_s8_u8(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
  return vreinterpretq_s8_u16(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
  return vreinterpretq_s8_u32(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
  return vreinterpretq_s8_u64(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
  return vreinterpretq_s8_f16(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
  return vreinterpretq_s8_f32(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_p8(
// CHECK: ret <16 x i8> %a
int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
  return vreinterpretq_s8_p8(a);
}

// CHECK-LABEL: @test_vreinterpretq_s8_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
  return vreinterpretq_s8_p16(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
  return vreinterpretq_s16_s8(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
  return vreinterpretq_s16_s32(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
  return vreinterpretq_s16_s64(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
  return vreinterpretq_s16_u8(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_u16(
// CHECK: ret <8 x i16> %a
int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
  return vreinterpretq_s16_u16(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
  return vreinterpretq_s16_u32(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
  return vreinterpretq_s16_u64(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_f16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
  return vreinterpretq_s16_f16(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
  return vreinterpretq_s16_f32(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_p8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK: ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
  return vreinterpretq_s16_p8(a);
}

// CHECK-LABEL: @test_vreinterpretq_s16_p16(
// CHECK: ret <8 x i16> %a
int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
  return vreinterpretq_s16_p16(a);
}

// CHECK-LABEL: @test_vreinterpretq_s32_s8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
  return vreinterpretq_s32_s8(a);
}

// CHECK-LABEL: @test_vreinterpretq_s32_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
  return vreinterpretq_s32_s16(a);
}

// CHECK-LABEL: @test_vreinterpretq_s32_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
  return vreinterpretq_s32_s64(a);
}

// CHECK-LABEL: @test_vreinterpretq_s32_u8(
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
  return vreinterpretq_s32_u8(a);
}

// CHECK-LABEL: @test_vreinterpretq_s32_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK: ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
  return vreinterpretq_s32_u16(a);
}

// CHECK-LABEL: @test_vreinterpretq_s32_u32(
// CHECK: ret <4 x i32> %a
int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
12068 return vreinterpretq_s32_u32(a);
12071 // CHECK-LABEL: @test_vreinterpretq_s32_u64(
12072 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
12073 // CHECK: ret <4 x i32> [[TMP0]]
12074 int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
12075 return vreinterpretq_s32_u64(a);
12078 // CHECK-LABEL: @test_vreinterpretq_s32_f16(
12079 // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
12080 // CHECK: ret <4 x i32> [[TMP0]]
12081 int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
12082 return vreinterpretq_s32_f16(a);
12085 // CHECK-LABEL: @test_vreinterpretq_s32_f32(
12086 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
12087 // CHECK: ret <4 x i32> [[TMP0]]
12088 int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
12089 return vreinterpretq_s32_f32(a);
12092 // CHECK-LABEL: @test_vreinterpretq_s32_p8(
12093 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12094 // CHECK: ret <4 x i32> [[TMP0]]
12095 int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
12096 return vreinterpretq_s32_p8(a);
12099 // CHECK-LABEL: @test_vreinterpretq_s32_p16(
12100 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12101 // CHECK: ret <4 x i32> [[TMP0]]
12102 int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
12103 return vreinterpretq_s32_p16(a);
12106 // CHECK-LABEL: @test_vreinterpretq_s64_s8(
12107 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12108 // CHECK: ret <2 x i64> [[TMP0]]
12109 int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
12110 return vreinterpretq_s64_s8(a);
12113 // CHECK-LABEL: @test_vreinterpretq_s64_s16(
12114 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12115 // CHECK: ret <2 x i64> [[TMP0]]
12116 int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
12117 return vreinterpretq_s64_s16(a);
12120 // CHECK-LABEL: @test_vreinterpretq_s64_s32(
12121 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
12122 // CHECK: ret <2 x i64> [[TMP0]]
12123 int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
12124 return vreinterpretq_s64_s32(a);
12127 // CHECK-LABEL: @test_vreinterpretq_s64_u8(
12128 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12129 // CHECK: ret <2 x i64> [[TMP0]]
12130 int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
12131 return vreinterpretq_s64_u8(a);
12134 // CHECK-LABEL: @test_vreinterpretq_s64_u16(
12135 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12136 // CHECK: ret <2 x i64> [[TMP0]]
12137 int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
12138 return vreinterpretq_s64_u16(a);
12141 // CHECK-LABEL: @test_vreinterpretq_s64_u32(
12142 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
12143 // CHECK: ret <2 x i64> [[TMP0]]
12144 int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
12145 return vreinterpretq_s64_u32(a);
12148 // CHECK-LABEL: @test_vreinterpretq_s64_u64(
12149 // CHECK: ret <2 x i64> %a
12150 int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
12151 return vreinterpretq_s64_u64(a);
12154 // CHECK-LABEL: @test_vreinterpretq_s64_f16(
12155 // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
12156 // CHECK: ret <2 x i64> [[TMP0]]
12157 int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
12158 return vreinterpretq_s64_f16(a);
12161 // CHECK-LABEL: @test_vreinterpretq_s64_f32(
12162 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
12163 // CHECK: ret <2 x i64> [[TMP0]]
12164 int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
12165 return vreinterpretq_s64_f32(a);
12168 // CHECK-LABEL: @test_vreinterpretq_s64_p8(
12169 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12170 // CHECK: ret <2 x i64> [[TMP0]]
12171 int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
12172 return vreinterpretq_s64_p8(a);
12175 // CHECK-LABEL: @test_vreinterpretq_s64_p16(
12176 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12177 // CHECK: ret <2 x i64> [[TMP0]]
12178 int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
12179 return vreinterpretq_s64_p16(a);
12182 // CHECK-LABEL: @test_vreinterpretq_u8_s8(
12183 // CHECK: ret <16 x i8> %a
12184 uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
12185 return vreinterpretq_u8_s8(a);
12188 // CHECK-LABEL: @test_vreinterpretq_u8_s16(
12189 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12190 // CHECK: ret <16 x i8> [[TMP0]]
12191 uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
12192 return vreinterpretq_u8_s16(a);
12195 // CHECK-LABEL: @test_vreinterpretq_u8_s32(
12196 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12197 // CHECK: ret <16 x i8> [[TMP0]]
12198 uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
12199 return vreinterpretq_u8_s32(a);
12202 // CHECK-LABEL: @test_vreinterpretq_u8_s64(
12203 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12204 // CHECK: ret <16 x i8> [[TMP0]]
12205 uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
12206 return vreinterpretq_u8_s64(a);
12209 // CHECK-LABEL: @test_vreinterpretq_u8_u16(
12210 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12211 // CHECK: ret <16 x i8> [[TMP0]]
12212 uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
12213 return vreinterpretq_u8_u16(a);
12216 // CHECK-LABEL: @test_vreinterpretq_u8_u32(
12217 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12218 // CHECK: ret <16 x i8> [[TMP0]]
12219 uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
12220 return vreinterpretq_u8_u32(a);
12223 // CHECK-LABEL: @test_vreinterpretq_u8_u64(
12224 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12225 // CHECK: ret <16 x i8> [[TMP0]]
12226 uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
12227 return vreinterpretq_u8_u64(a);
12230 // CHECK-LABEL: @test_vreinterpretq_u8_f16(
12231 // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
12232 // CHECK: ret <16 x i8> [[TMP0]]
12233 uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
12234 return vreinterpretq_u8_f16(a);
12237 // CHECK-LABEL: @test_vreinterpretq_u8_f32(
12238 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
12239 // CHECK: ret <16 x i8> [[TMP0]]
12240 uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
12241 return vreinterpretq_u8_f32(a);
12244 // CHECK-LABEL: @test_vreinterpretq_u8_p8(
12245 // CHECK: ret <16 x i8> %a
12246 uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
12247 return vreinterpretq_u8_p8(a);
12250 // CHECK-LABEL: @test_vreinterpretq_u8_p16(
12251 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12252 // CHECK: ret <16 x i8> [[TMP0]]
12253 uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
12254 return vreinterpretq_u8_p16(a);
12257 // CHECK-LABEL: @test_vreinterpretq_u16_s8(
12258 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12259 // CHECK: ret <8 x i16> [[TMP0]]
12260 uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
12261 return vreinterpretq_u16_s8(a);
12264 // CHECK-LABEL: @test_vreinterpretq_u16_s16(
12265 // CHECK: ret <8 x i16> %a
12266 uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
12267 return vreinterpretq_u16_s16(a);
12270 // CHECK-LABEL: @test_vreinterpretq_u16_s32(
12271 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
12272 // CHECK: ret <8 x i16> [[TMP0]]
12273 uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
12274 return vreinterpretq_u16_s32(a);
12277 // CHECK-LABEL: @test_vreinterpretq_u16_s64(
12278 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
12279 // CHECK: ret <8 x i16> [[TMP0]]
12280 uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
12281 return vreinterpretq_u16_s64(a);
12284 // CHECK-LABEL: @test_vreinterpretq_u16_u8(
12285 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12286 // CHECK: ret <8 x i16> [[TMP0]]
12287 uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
12288 return vreinterpretq_u16_u8(a);
12291 // CHECK-LABEL: @test_vreinterpretq_u16_u32(
12292 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
12293 // CHECK: ret <8 x i16> [[TMP0]]
12294 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
12295 return vreinterpretq_u16_u32(a);
12298 // CHECK-LABEL: @test_vreinterpretq_u16_u64(
12299 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
12300 // CHECK: ret <8 x i16> [[TMP0]]
12301 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
12302 return vreinterpretq_u16_u64(a);
12305 // CHECK-LABEL: @test_vreinterpretq_u16_f16(
12306 // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
12307 // CHECK: ret <8 x i16> [[TMP0]]
12308 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
12309 return vreinterpretq_u16_f16(a);
12312 // CHECK-LABEL: @test_vreinterpretq_u16_f32(
12313 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
12314 // CHECK: ret <8 x i16> [[TMP0]]
12315 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
12316 return vreinterpretq_u16_f32(a);
12319 // CHECK-LABEL: @test_vreinterpretq_u16_p8(
12320 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12321 // CHECK: ret <8 x i16> [[TMP0]]
12322 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
12323 return vreinterpretq_u16_p8(a);
12326 // CHECK-LABEL: @test_vreinterpretq_u16_p16(
12327 // CHECK: ret <8 x i16> %a
12328 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
12329 return vreinterpretq_u16_p16(a);
12332 // CHECK-LABEL: @test_vreinterpretq_u32_s8(
12333 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12334 // CHECK: ret <4 x i32> [[TMP0]]
12335 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
12336 return vreinterpretq_u32_s8(a);
12339 // CHECK-LABEL: @test_vreinterpretq_u32_s16(
12340 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12341 // CHECK: ret <4 x i32> [[TMP0]]
12342 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
12343 return vreinterpretq_u32_s16(a);
12346 // CHECK-LABEL: @test_vreinterpretq_u32_s32(
12347 // CHECK: ret <4 x i32> %a
12348 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
12349 return vreinterpretq_u32_s32(a);
12352 // CHECK-LABEL: @test_vreinterpretq_u32_s64(
12353 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
12354 // CHECK: ret <4 x i32> [[TMP0]]
12355 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
12356 return vreinterpretq_u32_s64(a);
12359 // CHECK-LABEL: @test_vreinterpretq_u32_u8(
12360 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12361 // CHECK: ret <4 x i32> [[TMP0]]
12362 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
12363 return vreinterpretq_u32_u8(a);
12366 // CHECK-LABEL: @test_vreinterpretq_u32_u16(
12367 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12368 // CHECK: ret <4 x i32> [[TMP0]]
12369 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
12370 return vreinterpretq_u32_u16(a);
12373 // CHECK-LABEL: @test_vreinterpretq_u32_u64(
12374 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
12375 // CHECK: ret <4 x i32> [[TMP0]]
12376 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
12377 return vreinterpretq_u32_u64(a);
12380 // CHECK-LABEL: @test_vreinterpretq_u32_f16(
12381 // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
12382 // CHECK: ret <4 x i32> [[TMP0]]
12383 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
12384 return vreinterpretq_u32_f16(a);
12387 // CHECK-LABEL: @test_vreinterpretq_u32_f32(
12388 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
12389 // CHECK: ret <4 x i32> [[TMP0]]
12390 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
12391 return vreinterpretq_u32_f32(a);
12394 // CHECK-LABEL: @test_vreinterpretq_u32_p8(
12395 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12396 // CHECK: ret <4 x i32> [[TMP0]]
12397 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
12398 return vreinterpretq_u32_p8(a);
12401 // CHECK-LABEL: @test_vreinterpretq_u32_p16(
12402 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12403 // CHECK: ret <4 x i32> [[TMP0]]
12404 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
12405 return vreinterpretq_u32_p16(a);
12408 // CHECK-LABEL: @test_vreinterpretq_u64_s8(
12409 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12410 // CHECK: ret <2 x i64> [[TMP0]]
12411 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
12412 return vreinterpretq_u64_s8(a);
12415 // CHECK-LABEL: @test_vreinterpretq_u64_s16(
12416 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12417 // CHECK: ret <2 x i64> [[TMP0]]
12418 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
12419 return vreinterpretq_u64_s16(a);
12422 // CHECK-LABEL: @test_vreinterpretq_u64_s32(
12423 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
12424 // CHECK: ret <2 x i64> [[TMP0]]
12425 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
12426 return vreinterpretq_u64_s32(a);
12429 // CHECK-LABEL: @test_vreinterpretq_u64_s64(
12430 // CHECK: ret <2 x i64> %a
12431 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
12432 return vreinterpretq_u64_s64(a);
12435 // CHECK-LABEL: @test_vreinterpretq_u64_u8(
12436 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12437 // CHECK: ret <2 x i64> [[TMP0]]
12438 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
12439 return vreinterpretq_u64_u8(a);
12442 // CHECK-LABEL: @test_vreinterpretq_u64_u16(
12443 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12444 // CHECK: ret <2 x i64> [[TMP0]]
12445 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
12446 return vreinterpretq_u64_u16(a);
12449 // CHECK-LABEL: @test_vreinterpretq_u64_u32(
12450 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
12451 // CHECK: ret <2 x i64> [[TMP0]]
12452 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
12453 return vreinterpretq_u64_u32(a);
12456 // CHECK-LABEL: @test_vreinterpretq_u64_f16(
12457 // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
12458 // CHECK: ret <2 x i64> [[TMP0]]
12459 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
12460 return vreinterpretq_u64_f16(a);
12463 // CHECK-LABEL: @test_vreinterpretq_u64_f32(
12464 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
12465 // CHECK: ret <2 x i64> [[TMP0]]
12466 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
12467 return vreinterpretq_u64_f32(a);
12470 // CHECK-LABEL: @test_vreinterpretq_u64_p8(
12471 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12472 // CHECK: ret <2 x i64> [[TMP0]]
12473 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
12474 return vreinterpretq_u64_p8(a);
12477 // CHECK-LABEL: @test_vreinterpretq_u64_p16(
12478 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12479 // CHECK: ret <2 x i64> [[TMP0]]
12480 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
12481 return vreinterpretq_u64_p16(a);
12484 // CHECK-LABEL: @test_vreinterpretq_f16_s8(
12485 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
12486 // CHECK: ret <8 x half> [[TMP0]]
12487 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
12488 return vreinterpretq_f16_s8(a);
12491 // CHECK-LABEL: @test_vreinterpretq_f16_s16(
12492 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
12493 // CHECK: ret <8 x half> [[TMP0]]
12494 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
12495 return vreinterpretq_f16_s16(a);
12498 // CHECK-LABEL: @test_vreinterpretq_f16_s32(
12499 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
12500 // CHECK: ret <8 x half> [[TMP0]]
12501 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
12502 return vreinterpretq_f16_s32(a);
12505 // CHECK-LABEL: @test_vreinterpretq_f16_s64(
12506 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
12507 // CHECK: ret <8 x half> [[TMP0]]
12508 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
12509 return vreinterpretq_f16_s64(a);
12512 // CHECK-LABEL: @test_vreinterpretq_f16_u8(
12513 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
12514 // CHECK: ret <8 x half> [[TMP0]]
12515 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
12516 return vreinterpretq_f16_u8(a);
12519 // CHECK-LABEL: @test_vreinterpretq_f16_u16(
12520 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
12521 // CHECK: ret <8 x half> [[TMP0]]
12522 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
12523 return vreinterpretq_f16_u16(a);
12526 // CHECK-LABEL: @test_vreinterpretq_f16_u32(
12527 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
12528 // CHECK: ret <8 x half> [[TMP0]]
12529 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
12530 return vreinterpretq_f16_u32(a);
12533 // CHECK-LABEL: @test_vreinterpretq_f16_u64(
12534 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
12535 // CHECK: ret <8 x half> [[TMP0]]
12536 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
12537 return vreinterpretq_f16_u64(a);
12540 // CHECK-LABEL: @test_vreinterpretq_f16_f32(
12541 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
12542 // CHECK: ret <8 x half> [[TMP0]]
12543 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
12544 return vreinterpretq_f16_f32(a);
12547 // CHECK-LABEL: @test_vreinterpretq_f16_p8(
12548 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
12549 // CHECK: ret <8 x half> [[TMP0]]
12550 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
12551 return vreinterpretq_f16_p8(a);
12554 // CHECK-LABEL: @test_vreinterpretq_f16_p16(
12555 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
12556 // CHECK: ret <8 x half> [[TMP0]]
12557 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
12558 return vreinterpretq_f16_p16(a);
12561 // CHECK-LABEL: @test_vreinterpretq_f32_s8(
12562 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
12563 // CHECK: ret <4 x float> [[TMP0]]
12564 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
12565 return vreinterpretq_f32_s8(a);
12568 // CHECK-LABEL: @test_vreinterpretq_f32_s16(
12569 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
12570 // CHECK: ret <4 x float> [[TMP0]]
12571 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
12572 return vreinterpretq_f32_s16(a);
12575 // CHECK-LABEL: @test_vreinterpretq_f32_s32(
12576 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
12577 // CHECK: ret <4 x float> [[TMP0]]
12578 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
12579 return vreinterpretq_f32_s32(a);
12582 // CHECK-LABEL: @test_vreinterpretq_f32_s64(
12583 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
12584 // CHECK: ret <4 x float> [[TMP0]]
12585 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
12586 return vreinterpretq_f32_s64(a);
12589 // CHECK-LABEL: @test_vreinterpretq_f32_u8(
12590 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
12591 // CHECK: ret <4 x float> [[TMP0]]
12592 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
12593 return vreinterpretq_f32_u8(a);
12596 // CHECK-LABEL: @test_vreinterpretq_f32_u16(
12597 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
12598 // CHECK: ret <4 x float> [[TMP0]]
12599 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
12600 return vreinterpretq_f32_u16(a);
12603 // CHECK-LABEL: @test_vreinterpretq_f32_u32(
12604 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
12605 // CHECK: ret <4 x float> [[TMP0]]
12606 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
12607 return vreinterpretq_f32_u32(a);
12610 // CHECK-LABEL: @test_vreinterpretq_f32_u64(
12611 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
12612 // CHECK: ret <4 x float> [[TMP0]]
12613 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
12614 return vreinterpretq_f32_u64(a);
12617 // CHECK-LABEL: @test_vreinterpretq_f32_f16(
12618 // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
12619 // CHECK: ret <4 x float> [[TMP0]]
12620 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
12621 return vreinterpretq_f32_f16(a);
12624 // CHECK-LABEL: @test_vreinterpretq_f32_p8(
12625 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
12626 // CHECK: ret <4 x float> [[TMP0]]
12627 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
12628 return vreinterpretq_f32_p8(a);
12631 // CHECK-LABEL: @test_vreinterpretq_f32_p16(
12632 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
12633 // CHECK: ret <4 x float> [[TMP0]]
12634 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
12635 return vreinterpretq_f32_p16(a);
12638 // CHECK-LABEL: @test_vreinterpretq_p8_s8(
12639 // CHECK: ret <16 x i8> %a
12640 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
12641 return vreinterpretq_p8_s8(a);
12644 // CHECK-LABEL: @test_vreinterpretq_p8_s16(
12645 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12646 // CHECK: ret <16 x i8> [[TMP0]]
12647 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
12648 return vreinterpretq_p8_s16(a);
12651 // CHECK-LABEL: @test_vreinterpretq_p8_s32(
12652 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12653 // CHECK: ret <16 x i8> [[TMP0]]
12654 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
12655 return vreinterpretq_p8_s32(a);
12658 // CHECK-LABEL: @test_vreinterpretq_p8_s64(
12659 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12660 // CHECK: ret <16 x i8> [[TMP0]]
12661 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
12662 return vreinterpretq_p8_s64(a);
12665 // CHECK-LABEL: @test_vreinterpretq_p8_u8(
12666 // CHECK: ret <16 x i8> %a
12667 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
12668 return vreinterpretq_p8_u8(a);
12671 // CHECK-LABEL: @test_vreinterpretq_p8_u16(
12672 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12673 // CHECK: ret <16 x i8> [[TMP0]]
12674 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
12675 return vreinterpretq_p8_u16(a);
12678 // CHECK-LABEL: @test_vreinterpretq_p8_u32(
12679 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12680 // CHECK: ret <16 x i8> [[TMP0]]
12681 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
12682 return vreinterpretq_p8_u32(a);
12685 // CHECK-LABEL: @test_vreinterpretq_p8_u64(
12686 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12687 // CHECK: ret <16 x i8> [[TMP0]]
12688 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
12689 return vreinterpretq_p8_u64(a);
12692 // CHECK-LABEL: @test_vreinterpretq_p8_f16(
12693 // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
12694 // CHECK: ret <16 x i8> [[TMP0]]
12695 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
12696 return vreinterpretq_p8_f16(a);
12699 // CHECK-LABEL: @test_vreinterpretq_p8_f32(
12700 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
12701 // CHECK: ret <16 x i8> [[TMP0]]
12702 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
12703 return vreinterpretq_p8_f32(a);
12706 // CHECK-LABEL: @test_vreinterpretq_p8_p16(
12707 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12708 // CHECK: ret <16 x i8> [[TMP0]]
12709 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
12710 return vreinterpretq_p8_p16(a);
12713 // CHECK-LABEL: @test_vreinterpretq_p16_s8(
12714 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12715 // CHECK: ret <8 x i16> [[TMP0]]
12716 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
12717 return vreinterpretq_p16_s8(a);
12720 // CHECK-LABEL: @test_vreinterpretq_p16_s16(
12721 // CHECK: ret <8 x i16> %a
12722 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
12723 return vreinterpretq_p16_s16(a);
12726 // CHECK-LABEL: @test_vreinterpretq_p16_s32(
12727 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
12728 // CHECK: ret <8 x i16> [[TMP0]]
12729 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
12730 return vreinterpretq_p16_s32(a);
12733 // CHECK-LABEL: @test_vreinterpretq_p16_s64(
12734 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
12735 // CHECK: ret <8 x i16> [[TMP0]]
12736 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
12737 return vreinterpretq_p16_s64(a);
12740 // CHECK-LABEL: @test_vreinterpretq_p16_u8(
12741 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12742 // CHECK: ret <8 x i16> [[TMP0]]
12743 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
12744 return vreinterpretq_p16_u8(a);
12747 // CHECK-LABEL: @test_vreinterpretq_p16_u16(
12748 // CHECK: ret <8 x i16> %a
12749 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
12750 return vreinterpretq_p16_u16(a);
12753 // CHECK-LABEL: @test_vreinterpretq_p16_u32(
12754 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
12755 // CHECK: ret <8 x i16> [[TMP0]]
12756 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
12757 return vreinterpretq_p16_u32(a);
12760 // CHECK-LABEL: @test_vreinterpretq_p16_u64(
12761 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
12762 // CHECK: ret <8 x i16> [[TMP0]]
12763 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
12764 return vreinterpretq_p16_u64(a);
12767 // CHECK-LABEL: @test_vreinterpretq_p16_f16(
12768 // CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
12769 // CHECK: ret <8 x i16> [[TMP0]]
12770 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
12771 return vreinterpretq_p16_f16(a);
12774 // CHECK-LABEL: @test_vreinterpretq_p16_f32(
12775 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
12776 // CHECK: ret <8 x i16> [[TMP0]]
12777 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
12778 return vreinterpretq_p16_f32(a);
12781 // CHECK-LABEL: @test_vreinterpretq_p16_p8(
12782 // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12783 // CHECK: ret <8 x i16> [[TMP0]]
12784 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
12785 return vreinterpretq_p16_p8(a);
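
// The vrev16/vrev32/vrev64 tests below check that each lane-reversal
// intrinsic folds to a single shufflevector whose mask reverses element
// order within every 16-, 32-, or 64-bit chunk of the vector.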

// CHECK-LABEL: @test_vrev16_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev16_s8(int8x8_t a) {
  return vrev16_s8(a);
}

// CHECK-LABEL: @test_vrev16_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev16_u8(uint8x8_t a) {
  return vrev16_u8(a);
}

// CHECK-LABEL: @test_vrev16_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev16_p8(poly8x8_t a) {
  return vrev16_p8(a);
}

// CHECK-LABEL: @test_vrev16q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev16q_s8(int8x16_t a) {
  return vrev16q_s8(a);
}

// CHECK-LABEL: @test_vrev16q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev16q_u8(uint8x16_t a) {
  return vrev16q_u8(a);
}

// CHECK-LABEL: @test_vrev16q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev16q_p8(poly8x16_t a) {
  return vrev16q_p8(a);
}

// CHECK-LABEL: @test_vrev32_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev32_s8(int8x8_t a) {
  return vrev32_s8(a);
}

// CHECK-LABEL: @test_vrev32_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev32_s16(int16x4_t a) {
  return vrev32_s16(a);
}

// CHECK-LABEL: @test_vrev32_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev32_u8(uint8x8_t a) {
  return vrev32_u8(a);
}

// CHECK-LABEL: @test_vrev32_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev32_u16(uint16x4_t a) {
  return vrev32_u16(a);
}

// CHECK-LABEL: @test_vrev32_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev32_p8(poly8x8_t a) {
  return vrev32_p8(a);
}

// CHECK-LABEL: @test_vrev32_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev32_p16(poly16x4_t a) {
  return vrev32_p16(a);
}

// CHECK-LABEL: @test_vrev32q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev32q_s8(int8x16_t a) {
  return vrev32q_s8(a);
}

// CHECK-LABEL: @test_vrev32q_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev32q_s16(int16x8_t a) {
  return vrev32q_s16(a);
}

// CHECK-LABEL: @test_vrev32q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev32q_u8(uint8x16_t a) {
  return vrev32q_u8(a);
}

// CHECK-LABEL: @test_vrev32q_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev32q_u16(uint16x8_t a) {
  return vrev32q_u16(a);
}

// CHECK-LABEL: @test_vrev32q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev32q_p8(poly8x16_t a) {
  return vrev32q_p8(a);
}

// CHECK-LABEL: @test_vrev32q_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev32q_p16(poly16x8_t a) {
  return vrev32q_p16(a);
}

// CHECK-LABEL: @test_vrev64_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev64_s8(int8x8_t a) {
  return vrev64_s8(a);
}

// CHECK-LABEL: @test_vrev64_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev64_s16(int16x4_t a) {
  return vrev64_s16(a);
}

// CHECK-LABEL: @test_vrev64_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vrev64_s32(int32x2_t a) {
  return vrev64_s32(a);
}

// CHECK-LABEL: @test_vrev64_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev64_u8(uint8x8_t a) {
  return vrev64_u8(a);
}

// CHECK-LABEL: @test_vrev64_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev64_u16(uint16x4_t a) {
  return vrev64_u16(a);
}

// CHECK-LABEL: @test_vrev64_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK: ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vrev64_u32(uint32x2_t a) {
  return vrev64_u32(a);
}

// CHECK-LABEL: @test_vrev64_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev64_p8(poly8x8_t a) {
  return vrev64_p8(a);
}

// CHECK-LABEL: @test_vrev64_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev64_p16(poly16x4_t a) {
  return vrev64_p16(a);
}

// CHECK-LABEL: @test_vrev64_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
// CHECK: ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vrev64_f32(float32x2_t a) {
  return vrev64_f32(a);
}

// CHECK-LABEL: @test_vrev64q_s8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev64q_s8(int8x16_t a) {
  return vrev64q_s8(a);
}

// CHECK-LABEL: @test_vrev64q_s16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev64q_s16(int16x8_t a) {
  return vrev64q_s16(a);
}

// CHECK-LABEL: @test_vrev64q_s32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vrev64q_s32(int32x4_t a) {
  return vrev64q_s32(a);
}

// CHECK-LABEL: @test_vrev64q_u8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev64q_u8(uint8x16_t a) {
  return vrev64q_u8(a);
}

// CHECK-LABEL: @test_vrev64q_u16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev64q_u16(uint16x8_t a) {
  return vrev64q_u16(a);
}

// CHECK-LABEL: @test_vrev64q_u32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vrev64q_u32(uint32x4_t a) {
  return vrev64q_u32(a);
}

// CHECK-LABEL: @test_vrev64q_p8(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK: ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev64q_p8(poly8x16_t a) {
  return vrev64q_p8(a);
}

// CHECK-LABEL: @test_vrev64q_p16(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev64q_p16(poly16x8_t a) {
  return vrev64q_p16(a);
}

// CHECK-LABEL: @test_vrev64q_f32(
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vrev64q_f32(float32x4_t a) {
  return vrev64q_f32(a);
}
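
// vrhadd computes the rounding halving add (a + b + 1) >> 1 per lane; it
// has no generic IR equivalent, so it is expected to survive as a call to
// @llvm.arm.neon.vrhadds (signed) or @llvm.arm.neon.vrhaddu (unsigned).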

// CHECK-LABEL: @test_vrhadd_s8(
// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VRHADD_V_I]]
int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
  return vrhadd_s8(a, b);
}

// CHECK-LABEL: @test_vrhadd_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
  return vrhadd_s16(a, b);
}

// CHECK-LABEL: @test_vrhadd_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
  return vrhadd_s32(a, b);
}

// CHECK-LABEL: @test_vrhadd_u8(
// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VRHADD_V_I]]
uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
  return vrhadd_u8(a, b);
}

// CHECK-LABEL: @test_vrhadd_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
  return vrhadd_u16(a, b);
}

// CHECK-LABEL: @test_vrhadd_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
  return vrhadd_u32(a, b);
}

// CHECK-LABEL: @test_vrhaddq_s8(
// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
  return vrhaddq_s8(a, b);
}

// CHECK-LABEL: @test_vrhaddq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
  return vrhaddq_s16(a, b);
}

// CHECK-LABEL: @test_vrhaddq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

// CHECK-LABEL: @test_vrhaddq_u8(
// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vrhaddq_u8(a, b);
}

// CHECK-LABEL: @test_vrhaddq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vrhaddq_u16(a, b);
}

// CHECK-LABEL: @test_vrhaddq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vrhaddq_u32(a, b);
}
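
// vrshl takes a signed per-lane shift count (negative counts shift right,
// with rounding) and lowers to @llvm.arm.neon.vrshifts/vrshiftu.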

// CHECK-LABEL: @test_vrshl_s8(
// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VRSHL_V_I]]
int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
  return vrshl_s8(a, b);
}

// CHECK-LABEL: @test_vrshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
  return vrshl_s16(a, b);
}

// CHECK-LABEL: @test_vrshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
  return vrshl_s32(a, b);
}

// CHECK-LABEL: @test_vrshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
  return vrshl_s64(a, b);
}

// CHECK-LABEL: @test_vrshl_u8(
// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VRSHL_V_I]]
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
  return vrshl_u8(a, b);
}

// CHECK-LABEL: @test_vrshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
  return vrshl_u16(a, b);
}

// CHECK-LABEL: @test_vrshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
  return vrshl_u32(a, b);
}

// CHECK-LABEL: @test_vrshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
  return vrshl_u64(a, b);
}

// CHECK-LABEL: @test_vrshlq_s8(
// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
  return vrshlq_s8(a, b);
}

// CHECK-LABEL: @test_vrshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
  return vrshlq_s16(a, b);
}

// CHECK-LABEL: @test_vrshlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
  return vrshlq_s32(a, b);
}

// CHECK-LABEL: @test_vrshlq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
  return vrshlq_s64(a, b);
}

// CHECK-LABEL: @test_vrshlq_u8(
// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
  return vrshlq_u8(a, b);
}

// CHECK-LABEL: @test_vrshlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
  return vrshlq_u16(a, b);
}

// CHECK-LABEL: @test_vrshlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
  return vrshlq_u32(a, b);
}

// CHECK-LABEL: @test_vrshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
  return vrshlq_u64(a, b);
}
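
// vrshrn_n narrows after a rounding right shift by an immediate; the shift
// amount is encoded as a splatted negative count (a right shift by 1 shows
// up as a vector of -1) passed to @llvm.arm.neon.vrshiftn.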

// CHECK-LABEL: @test_vrshrn_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <8 x i8> [[VRSHRN_N1]]
int8x8_t test_vrshrn_n_s16(int16x8_t a) {
  return vrshrn_n_s16(a, 1);
}

// CHECK-LABEL: @test_vrshrn_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: ret <4 x i16> [[VRSHRN_N1]]
int16x4_t test_vrshrn_n_s32(int32x4_t a) {
  return vrshrn_n_s32(a, 1);
}

// CHECK-LABEL: @test_vrshrn_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: ret <2 x i32> [[VRSHRN_N1]]
int32x2_t test_vrshrn_n_s64(int64x2_t a) {
  return vrshrn_n_s64(a, 1);
}

// CHECK-LABEL: @test_vrshrn_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <8 x i8> [[VRSHRN_N1]]
uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
  return vrshrn_n_u16(a, 1);
}

// CHECK-LABEL: @test_vrshrn_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: ret <4 x i16> [[VRSHRN_N1]]
uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
  return vrshrn_n_u32(a, 1);
}

// CHECK-LABEL: @test_vrshrn_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: ret <2 x i32> [[VRSHRN_N1]]
uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
  return vrshrn_n_u64(a, 1);
}
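
// vrshr_n and vrshrq_n reuse the vrshifts/vrshiftu intrinsics with the
// same negated-immediate encoding, so a right shift by 1 appears as a
// splat of -1 in the second operand.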

// CHECK-LABEL: @test_vrshr_n_s8(
// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK: ret <8 x i8> [[VRSHR_N]]
int8x8_t test_vrshr_n_s8(int8x8_t a) {
  return vrshr_n_s8(a, 1);
}

// CHECK-LABEL: @test_vrshr_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <4 x i16> [[VRSHR_N1]]
int16x4_t test_vrshr_n_s16(int16x4_t a) {
  return vrshr_n_s16(a, 1);
}

// CHECK-LABEL: @test_vrshr_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
// CHECK: ret <2 x i32> [[VRSHR_N1]]
int32x2_t test_vrshr_n_s32(int32x2_t a) {
  return vrshr_n_s32(a, 1);
}

// CHECK-LABEL: @test_vrshr_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
// CHECK: ret <1 x i64> [[VRSHR_N1]]
int64x1_t test_vrshr_n_s64(int64x1_t a) {
  return vrshr_n_s64(a, 1);
}

// CHECK-LABEL: @test_vrshr_n_u8(
// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK: ret <8 x i8> [[VRSHR_N]]
uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
  return vrshr_n_u8(a, 1);
}

// CHECK-LABEL: @test_vrshr_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: ret <4 x i16> [[VRSHR_N1]]
uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
  return vrshr_n_u16(a, 1);
}

// CHECK-LABEL: @test_vrshr_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
// CHECK: ret <2 x i32> [[VRSHR_N1]]
uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
  return vrshr_n_u32(a, 1);
}

// CHECK-LABEL: @test_vrshr_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
// CHECK: ret <1 x i64> [[VRSHR_N1]]
uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
  return vrshr_n_u64(a, 1);
}
13418 // CHECK-LABEL: @test_vrshrq_n_s8(
13419 // CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13420 // CHECK: ret <16 x i8> [[VRSHR_N]]
13421 int8x16_t test_vrshrq_n_s8(int8x16_t a) {
13422 return vrshrq_n_s8(a, 1);
13425 // CHECK-LABEL: @test_vrshrq_n_s16(
13426 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13427 // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13428 // CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13429 // CHECK: ret <8 x i16> [[VRSHR_N1]]
13430 int16x8_t test_vrshrq_n_s16(int16x8_t a) {
13431 return vrshrq_n_s16(a, 1);
13434 // CHECK-LABEL: @test_vrshrq_n_s32(
13435 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13436 // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13437 // CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13438 // CHECK: ret <4 x i32> [[VRSHR_N1]]
13439 int32x4_t test_vrshrq_n_s32(int32x4_t a) {
13440 return vrshrq_n_s32(a, 1);
13443 // CHECK-LABEL: @test_vrshrq_n_s64(
13444 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13445 // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13446 // CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
13447 // CHECK: ret <2 x i64> [[VRSHR_N1]]
13448 int64x2_t test_vrshrq_n_s64(int64x2_t a) {
13449 return vrshrq_n_s64(a, 1);
13452 // CHECK-LABEL: @test_vrshrq_n_u8(
13453 // CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13454 // CHECK: ret <16 x i8> [[VRSHR_N]]
13455 uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
13456 return vrshrq_n_u8(a, 1);
13459 // CHECK-LABEL: @test_vrshrq_n_u16(
13460 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13461 // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13462 // CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13463 // CHECK: ret <8 x i16> [[VRSHR_N1]]
13464 uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
13465 return vrshrq_n_u16(a, 1);
13468 // CHECK-LABEL: @test_vrshrq_n_u32(
13469 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13470 // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13471 // CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13472 // CHECK: ret <4 x i32> [[VRSHR_N1]]
13473 uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
13474 return vrshrq_n_u32(a, 1);
13477 // CHECK-LABEL: @test_vrshrq_n_u64(
13478 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13479 // CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13480 // CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
13481 // CHECK: ret <2 x i64> [[VRSHR_N1]]
13482 uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
13483 return vrshrq_n_u64(a, 1);
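
// vrshr_n_*/vrshrq_n_* round before shifting: each lane becomes
// (x + (1 << (n-1))) >> n, e.g. for x = -3 and n = 1, (-3 + 1) >> 1 = -1.
// As with the narrowing forms, the shift amount is encoded as a negative
// left-shift count, so vrshifts (signed) and vrshiftu (unsigned) receive
// splats of -n rather than n.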

// CHECK-LABEL: @test_vrsqrte_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a)
// CHECK: ret <2 x float> [[VRSQRTE_V1_I]]
float32x2_t test_vrsqrte_f32(float32x2_t a) {
  return vrsqrte_f32(a);
}

// CHECK-LABEL: @test_vrsqrte_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a)
// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]]
uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
  return vrsqrte_u32(a);
}

// CHECK-LABEL: @test_vrsqrteq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a)
// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]]
float32x4_t test_vrsqrteq_f32(float32x4_t a) {
  return vrsqrteq_f32(a);
}

// CHECK-LABEL: @test_vrsqrteq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a)
// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]]
uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
  return vrsqrteq_u32(a);
}
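
// vrsqrte_*/vrsqrteq_* map straight onto llvm.arm.neon.vrsqrte and produce
// a low-precision per-lane estimate of 1/sqrt(x) (the u32 forms estimate on
// the integer value). The unused leading bitcasts appear to survive only
// because the test builds with -disable-O0-optnone and is cleaned up with
// nothing stronger than mem2reg.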

// CHECK-LABEL: @test_vrsqrts_f32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %a, <2 x float> %b)
// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
// CHECK: ret <2 x float> [[VRSQRTS_V2_I]]
float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
  return vrsqrts_f32(a, b);
}

// CHECK-LABEL: @test_vrsqrtsq_f32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %a, <4 x float> %b)
// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]]
float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
  return vrsqrtsq_f32(a, b);
}
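
// vrsqrts_*/vrsqrtsq_* compute the Newton-Raphson step (3 - a*b) / 2 per
// lane; one or more of these steps refines a vrsqrte estimate toward
// 1/sqrt(x).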

// CHECK-LABEL: @test_vrsra_n_s8(
// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
// CHECK: ret <8 x i8> [[VRSRA_N]]
int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
  return vrsra_n_s8(a, b, 1);
}

// CHECK-LABEL: @test_vrsra_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
// CHECK: ret <4 x i16> [[VRSRA_N]]
int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
  return vrsra_n_s16(a, b, 1);
}

// CHECK-LABEL: @test_vrsra_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
// CHECK: ret <2 x i32> [[VRSRA_N]]
int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
  return vrsra_n_s32(a, b, 1);
}

// CHECK-LABEL: @test_vrsra_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
// CHECK: ret <1 x i64> [[VRSRA_N]]
int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
  return vrsra_n_s64(a, b, 1);
}

// CHECK-LABEL: @test_vrsra_n_u8(
// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
// CHECK: ret <8 x i8> [[VRSRA_N]]
uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
  return vrsra_n_u8(a, b, 1);
}

// CHECK-LABEL: @test_vrsra_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
// CHECK: ret <4 x i16> [[VRSRA_N]]
uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
  return vrsra_n_u16(a, b, 1);
}

// CHECK-LABEL: @test_vrsra_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
// CHECK: ret <2 x i32> [[VRSRA_N]]
uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
  return vrsra_n_u32(a, b, 1);
}

// CHECK-LABEL: @test_vrsra_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
// CHECK: ret <1 x i64> [[VRSRA_N]]
uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
  return vrsra_n_u64(a, b, 1);
}

// CHECK-LABEL: @test_vrsraq_n_s8(
// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
// CHECK: ret <16 x i8> [[VRSRA_N]]
int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
  return vrsraq_n_s8(a, b, 1);
}

// CHECK-LABEL: @test_vrsraq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
// CHECK: ret <8 x i16> [[VRSRA_N]]
int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
  return vrsraq_n_s16(a, b, 1);
}

// CHECK-LABEL: @test_vrsraq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
// CHECK: ret <4 x i32> [[VRSRA_N]]
int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
  return vrsraq_n_s32(a, b, 1);
}

// CHECK-LABEL: @test_vrsraq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
// CHECK: ret <2 x i64> [[VRSRA_N]]
int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vrsraq_n_s64(a, b, 1);
}

// CHECK-LABEL: @test_vrsraq_n_u8(
// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
// CHECK: ret <16 x i8> [[VRSRA_N]]
uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vrsraq_n_u8(a, b, 1);
}

// CHECK-LABEL: @test_vrsraq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
// CHECK: ret <8 x i16> [[VRSRA_N]]
uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vrsraq_n_u16(a, b, 1);
}

// CHECK-LABEL: @test_vrsraq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
// CHECK: ret <4 x i32> [[VRSRA_N]]
uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vrsraq_n_u32(a, b, 1);
}

// CHECK-LABEL: @test_vrsraq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
// CHECK: ret <2 x i64> [[VRSRA_N]]
uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vrsraq_n_u64(a, b, 1);
}
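
// vrsra_n_*/vrsraq_n_* have no dedicated intrinsic of their own: the IR is
// the rounding right shift (vrshifts/vrshiftu with a negated count) of the
// second operand followed by a plain add into the accumulator, i.e.
// a + vrshr_n(b, n).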

// CHECK-LABEL: @test_vrsubhn_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
  return vrsubhn_s16(a, b);
}

// CHECK-LABEL: @test_vrsubhn_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
  return vrsubhn_s32(a, b);
}

// CHECK-LABEL: @test_vrsubhn_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
  return vrsubhn_s64(a, b);
}

// CHECK-LABEL: @test_vrsubhn_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vrsubhn_u16(a, b);
}

// CHECK-LABEL: @test_vrsubhn_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vrsubhn_u32(a, b);
}

// CHECK-LABEL: @test_vrsubhn_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vrsubhn_u64(a, b);
}
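
// vrsubhn_* subtracts, rounds, and keeps the high half of each lane: for
// lanes of width w the result is roughly (a - b + (1 << (w/2 - 1))) >> (w/2),
// truncated to the half-width element type.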

// CHECK-LABEL: @test_vset_lane_u8(
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK: ret <8 x i8> [[VSET_LANE]]
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
  return vset_lane_u8(a, b, 7);
}

// CHECK-LABEL: @test_vset_lane_u16(
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3
// CHECK: ret <4 x i16> [[VSET_LANE]]
uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
  return vset_lane_u16(a, b, 3);
}

// CHECK-LABEL: @test_vset_lane_u32(
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> %b, i32 %a, i32 1
// CHECK: ret <2 x i32> [[VSET_LANE]]
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  return vset_lane_u32(a, b, 1);
}

// CHECK-LABEL: @test_vset_lane_s8(
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK: ret <8 x i8> [[VSET_LANE]]
int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
  return vset_lane_s8(a, b, 7);
}

// CHECK-LABEL: @test_vset_lane_s16(
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3
// CHECK: ret <4 x i16> [[VSET_LANE]]
int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
  return vset_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vset_lane_s32(
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> %b, i32 %a, i32 1
// CHECK: ret <2 x i32> [[VSET_LANE]]
int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
  return vset_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vset_lane_p8(
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK: ret <8 x i8> [[VSET_LANE]]
poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
  return vset_lane_p8(a, b, 7);
}

// CHECK-LABEL: @test_vset_lane_p16(
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3
// CHECK: ret <4 x i16> [[VSET_LANE]]
poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
  return vset_lane_p16(a, b, 3);
}

// CHECK-LABEL: @test_vset_lane_f32(
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> %b, float %a, i32 1
// CHECK: ret <2 x float> [[VSET_LANE]]
float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
  return vset_lane_f32(a, b, 1);
}

// CHECK-LABEL: @test_vset_lane_f16(
// CHECK: [[__REINT_246:%.*]] = alloca half, align 2
// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
// CHECK: store half [[TMP0]], ptr [[__REINT_246]], align 2
// CHECK: store <4 x half> %b, ptr [[__REINT1_246]], align 8
// CHECK: [[TMP2:%.*]] = load i16, ptr [[__REINT_246]], align 2
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT1_246]], align 8
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP4]], i16 [[TMP2]], i32 1
// CHECK: store <4 x i16> [[VSET_LANE]], ptr [[__REINT2_246]], align 8
// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[__REINT2_246]], align 8
// CHECK: ret <4 x half> [[TMP8]]
float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
  return vset_lane_f16(*a, b, 1);
}
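
// The f16 variant does not insert a half lane directly: the scalar and the
// vector are spilled to temporaries and reloaded as i16 and <4 x i16> before
// the insertelement, then stored and reloaded once more as <4 x half>.
// Taking the scalar by pointer here is presumably a test-local workaround
// for passing a bare half argument under this ABI.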

// CHECK-LABEL: @test_vsetq_lane_u8(
// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK: ret <16 x i8> [[VSET_LANE]]
uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
  return vsetq_lane_u8(a, b, 15);
}

// CHECK-LABEL: @test_vsetq_lane_u16(
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7
// CHECK: ret <8 x i16> [[VSET_LANE]]
uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
  return vsetq_lane_u16(a, b, 7);
}

// CHECK-LABEL: @test_vsetq_lane_u32(
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> %b, i32 %a, i32 3
// CHECK: ret <4 x i32> [[VSET_LANE]]
uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
  return vsetq_lane_u32(a, b, 3);
}

// CHECK-LABEL: @test_vsetq_lane_s8(
// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK: ret <16 x i8> [[VSET_LANE]]
int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
  return vsetq_lane_s8(a, b, 15);
}

// CHECK-LABEL: @test_vsetq_lane_s16(
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7
// CHECK: ret <8 x i16> [[VSET_LANE]]
int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
  return vsetq_lane_s16(a, b, 7);
}

// CHECK-LABEL: @test_vsetq_lane_s32(
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> %b, i32 %a, i32 3
// CHECK: ret <4 x i32> [[VSET_LANE]]
int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
  return vsetq_lane_s32(a, b, 3);
}

// CHECK-LABEL: @test_vsetq_lane_p8(
// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK: ret <16 x i8> [[VSET_LANE]]
poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
  return vsetq_lane_p8(a, b, 15);
}

// CHECK-LABEL: @test_vsetq_lane_p16(
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7
// CHECK: ret <8 x i16> [[VSET_LANE]]
poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
  return vsetq_lane_p16(a, b, 7);
}

// CHECK-LABEL: @test_vsetq_lane_f32(
// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> %b, float %a, i32 3
// CHECK: ret <4 x float> [[VSET_LANE]]
float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
  return vsetq_lane_f32(a, b, 3);
}

// CHECK-LABEL: @test_vsetq_lane_f16(
// CHECK: [[__REINT_248:%.*]] = alloca half, align 2
// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
// CHECK: store half [[TMP0]], ptr [[__REINT_248]], align 2
// CHECK: store <8 x half> %b, ptr [[__REINT1_248]], align 16
// CHECK: [[TMP2:%.*]] = load i16, ptr [[__REINT_248]], align 2
// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT1_248]], align 16
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[TMP2]], i32 3
// CHECK: store <8 x i16> [[VSET_LANE]], ptr [[__REINT2_248]], align 16
// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[__REINT2_248]], align 16
// CHECK: ret <8 x half> [[TMP8]]
float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
  return vsetq_lane_f16(*a, b, 3);
}

// CHECK-LABEL: @test_vset_lane_s64(
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %b, i64 %a, i32 0
// CHECK: ret <1 x i64> [[VSET_LANE]]
int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
  return vset_lane_s64(a, b, 0);
}

// CHECK-LABEL: @test_vset_lane_u64(
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %b, i64 %a, i32 0
// CHECK: ret <1 x i64> [[VSET_LANE]]
uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
  return vset_lane_u64(a, b, 0);
}

// CHECK-LABEL: @test_vsetq_lane_s64(
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %b, i64 %a, i32 1
// CHECK: ret <2 x i64> [[VSET_LANE]]
int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
  return vsetq_lane_s64(a, b, 1);
}

// CHECK-LABEL: @test_vsetq_lane_u64(
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %b, i64 %a, i32 1
// CHECK: ret <2 x i64> [[VSET_LANE]]
uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
  return vsetq_lane_u64(a, b, 1);
}
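
// Every integer, float, and poly vset_lane/vsetq_lane variant lowers to a
// single insertelement with a constant lane index; the index is a
// compile-time immediate checked against the vector length, so no bounds
// logic appears in the IR.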

// CHECK-LABEL: @test_vshl_s8(
// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VSHL_V_I]]
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
  return vshl_s8(a, b);
}

// CHECK-LABEL: @test_vshl_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VSHL_V2_I]]
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
  return vshl_s16(a, b);
}

// CHECK-LABEL: @test_vshl_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VSHL_V2_I]]
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
  return vshl_s32(a, b);
}

// CHECK-LABEL: @test_vshl_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VSHL_V2_I]]
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
  return vshl_s64(a, b);
}

// CHECK-LABEL: @test_vshl_u8(
// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
// CHECK: ret <8 x i8> [[VSHL_V_I]]
uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
  return vshl_u8(a, b);
}

// CHECK-LABEL: @test_vshl_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK: ret <4 x i16> [[VSHL_V2_I]]
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
  return vshl_u16(a, b);
}

// CHECK-LABEL: @test_vshl_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK: ret <2 x i32> [[VSHL_V2_I]]
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
  return vshl_u32(a, b);
}

// CHECK-LABEL: @test_vshl_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK: ret <1 x i64> [[VSHL_V2_I]]
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
  return vshl_u64(a, b);
}

// CHECK-LABEL: @test_vshlq_s8(
// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
  return vshlq_s8(a, b);
}

// CHECK-LABEL: @test_vshlq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
  return vshlq_s16(a, b);
}

// CHECK-LABEL: @test_vshlq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
  return vshlq_s32(a, b);
}

// CHECK-LABEL: @test_vshlq_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
  return vshlq_s64(a, b);
}

// CHECK-LABEL: @test_vshlq_u8(
// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
  return vshlq_u8(a, b);
}

// CHECK-LABEL: @test_vshlq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
  return vshlq_u16(a, b);
}

// CHECK-LABEL: @test_vshlq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
  return vshlq_u32(a, b);
}

// CHECK-LABEL: @test_vshlq_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
  return vshlq_u64(a, b);
}
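
// The register forms vshl_*/vshlq_* shift each lane of a left by the signed
// per-lane count in b, and negative counts shift right. That is why the
// unsigned variants still take a signed vector for b, and why separate
// vshifts/vshiftu intrinsics exist even for a "left" shift: signedness only
// matters once a negative count turns it into a right shift.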

// CHECK-LABEL: @test_vshll_n_s8(
// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <8 x i16> [[VSHLL_N]]
int16x8_t test_vshll_n_s8(int8x8_t a) {
  return vshll_n_s8(a, 1);
}

// CHECK-LABEL: @test_vshll_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
// CHECK: ret <4 x i32> [[VSHLL_N]]
int32x4_t test_vshll_n_s16(int16x4_t a) {
  return vshll_n_s16(a, 1);
}

// CHECK-LABEL: @test_vshll_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
// CHECK: ret <2 x i64> [[VSHLL_N]]
int64x2_t test_vshll_n_s32(int32x2_t a) {
  return vshll_n_s32(a, 1);
}

// CHECK-LABEL: @test_vshll_n_u8(
// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <8 x i16> [[VSHLL_N]]
uint16x8_t test_vshll_n_u8(uint8x8_t a) {
  return vshll_n_u8(a, 1);
}

// CHECK-LABEL: @test_vshll_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
// CHECK: ret <4 x i32> [[VSHLL_N]]
uint32x4_t test_vshll_n_u16(uint16x4_t a) {
  return vshll_n_u16(a, 1);
}

// CHECK-LABEL: @test_vshll_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
// CHECK: ret <2 x i64> [[VSHLL_N]]
uint64x2_t test_vshll_n_u32(uint32x2_t a) {
  return vshll_n_u32(a, 1);
}
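
// vshll_n_* widens first and shifts second: a sext (signed) or zext
// (unsigned) to double-width lanes followed by an ordinary IR shl, so no
// target intrinsic is needed for these shift amounts.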

// CHECK-LABEL: @test_vshl_n_s8(
// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK: ret <8 x i8> [[VSHL_N]]
int8x8_t test_vshl_n_s8(int8x8_t a) {
  return vshl_n_s8(a, 1);
}

// CHECK-LABEL: @test_vshl_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <4 x i16> [[VSHL_N]]
int16x4_t test_vshl_n_s16(int16x4_t a) {
  return vshl_n_s16(a, 1);
}

// CHECK-LABEL: @test_vshl_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
// CHECK: ret <2 x i32> [[VSHL_N]]
int32x2_t test_vshl_n_s32(int32x2_t a) {
  return vshl_n_s32(a, 1);
}

// CHECK-LABEL: @test_vshl_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
// CHECK: ret <1 x i64> [[VSHL_N]]
int64x1_t test_vshl_n_s64(int64x1_t a) {
  return vshl_n_s64(a, 1);
}

// CHECK-LABEL: @test_vshl_n_u8(
// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK: ret <8 x i8> [[VSHL_N]]
uint8x8_t test_vshl_n_u8(uint8x8_t a) {
  return vshl_n_u8(a, 1);
}

// CHECK-LABEL: @test_vshl_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <4 x i16> [[VSHL_N]]
uint16x4_t test_vshl_n_u16(uint16x4_t a) {
  return vshl_n_u16(a, 1);
}

// CHECK-LABEL: @test_vshl_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
// CHECK: ret <2 x i32> [[VSHL_N]]
uint32x2_t test_vshl_n_u32(uint32x2_t a) {
  return vshl_n_u32(a, 1);
}

// CHECK-LABEL: @test_vshl_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
// CHECK: ret <1 x i64> [[VSHL_N]]
uint64x1_t test_vshl_n_u64(uint64x1_t a) {
  return vshl_n_u64(a, 1);
}

// CHECK-LABEL: @test_vshlq_n_s8(
// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK: ret <16 x i8> [[VSHL_N]]
int8x16_t test_vshlq_n_s8(int8x16_t a) {
  return vshlq_n_s8(a, 1);
}

// CHECK-LABEL: @test_vshlq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <8 x i16> [[VSHL_N]]
int16x8_t test_vshlq_n_s16(int16x8_t a) {
  return vshlq_n_s16(a, 1);
}

// CHECK-LABEL: @test_vshlq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK: ret <4 x i32> [[VSHL_N]]
int32x4_t test_vshlq_n_s32(int32x4_t a) {
  return vshlq_n_s32(a, 1);
}

// CHECK-LABEL: @test_vshlq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK: ret <2 x i64> [[VSHL_N]]
int64x2_t test_vshlq_n_s64(int64x2_t a) {
  return vshlq_n_s64(a, 1);
}

// CHECK-LABEL: @test_vshlq_n_u8(
// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK: ret <16 x i8> [[VSHL_N]]
uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
  return vshlq_n_u8(a, 1);
}

// CHECK-LABEL: @test_vshlq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <8 x i16> [[VSHL_N]]
uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
  return vshlq_n_u16(a, 1);
}

// CHECK-LABEL: @test_vshlq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK: ret <4 x i32> [[VSHL_N]]
uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
  return vshlq_n_u32(a, 1);
}

// CHECK-LABEL: @test_vshlq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK: ret <2 x i64> [[VSHL_N]]
uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
  return vshlq_n_u64(a, 1);
}
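
// Immediate left shifts need no NEON intrinsic at all: vshl_n_*/vshlq_n_*
// lower to a plain IR shl by a splat constant, and the surrounding bitcasts
// merely reconcile the builtin's byte-vector view with the element type.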

// CHECK-LABEL: @test_vshrn_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
// CHECK: ret <8 x i8> [[VSHRN_N]]
int8x8_t test_vshrn_n_s16(int16x8_t a) {
  return vshrn_n_s16(a, 1);
}

// CHECK-LABEL: @test_vshrn_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
// CHECK: ret <4 x i16> [[VSHRN_N]]
int16x4_t test_vshrn_n_s32(int32x4_t a) {
  return vshrn_n_s32(a, 1);
}

// CHECK-LABEL: @test_vshrn_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
// CHECK: ret <2 x i32> [[VSHRN_N]]
int32x2_t test_vshrn_n_s64(int64x2_t a) {
  return vshrn_n_s64(a, 1);
}

// CHECK-LABEL: @test_vshrn_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
// CHECK: ret <8 x i8> [[VSHRN_N]]
uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
  return vshrn_n_u16(a, 1);
}

// CHECK-LABEL: @test_vshrn_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
// CHECK: ret <4 x i16> [[VSHRN_N]]
uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
  return vshrn_n_u32(a, 1);
}

// CHECK-LABEL: @test_vshrn_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
// CHECK: ret <2 x i32> [[VSHRN_N]]
uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
  return vshrn_n_u64(a, 1);
}
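
// The non-rounding narrowing shift vshrn_n_* is likewise plain IR: ashr
// (signed) or lshr (unsigned) by a splat constant, then trunc to the
// half-width vector.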

// CHECK-LABEL: @test_vshr_n_s8(
// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK: ret <8 x i8> [[VSHR_N]]
int8x8_t test_vshr_n_s8(int8x8_t a) {
  return vshr_n_s8(a, 1);
}

// CHECK-LABEL: @test_vshr_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <4 x i16> [[VSHR_N]]
int16x4_t test_vshr_n_s16(int16x4_t a) {
  return vshr_n_s16(a, 1);
}

// CHECK-LABEL: @test_vshr_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1>
// CHECK: ret <2 x i32> [[VSHR_N]]
int32x2_t test_vshr_n_s32(int32x2_t a) {
  return vshr_n_s32(a, 1);
}

// CHECK-LABEL: @test_vshr_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
// CHECK: ret <1 x i64> [[VSHR_N]]
int64x1_t test_vshr_n_s64(int64x1_t a) {
  return vshr_n_s64(a, 1);
}

// CHECK-LABEL: @test_vshr_n_u8(
// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK: ret <8 x i8> [[VSHR_N]]
uint8x8_t test_vshr_n_u8(uint8x8_t a) {
  return vshr_n_u8(a, 1);
}

// CHECK-LABEL: @test_vshr_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <4 x i16> [[VSHR_N]]
uint16x4_t test_vshr_n_u16(uint16x4_t a) {
  return vshr_n_u16(a, 1);
}

// CHECK-LABEL: @test_vshr_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1>
// CHECK: ret <2 x i32> [[VSHR_N]]
uint32x2_t test_vshr_n_u32(uint32x2_t a) {
  return vshr_n_u32(a, 1);
}

// CHECK-LABEL: @test_vshr_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
// CHECK: ret <1 x i64> [[VSHR_N]]
uint64x1_t test_vshr_n_u64(uint64x1_t a) {
  return vshr_n_u64(a, 1);
}

// CHECK-LABEL: @test_vshrq_n_s8(
// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK: ret <16 x i8> [[VSHR_N]]
int8x16_t test_vshrq_n_s8(int8x16_t a) {
  return vshrq_n_s8(a, 1);
}

// CHECK-LABEL: @test_vshrq_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <8 x i16> [[VSHR_N]]
int16x8_t test_vshrq_n_s16(int16x8_t a) {
  return vshrq_n_s16(a, 1);
}

// CHECK-LABEL: @test_vshrq_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK: ret <4 x i32> [[VSHR_N]]
int32x4_t test_vshrq_n_s32(int32x4_t a) {
  return vshrq_n_s32(a, 1);
}

// CHECK-LABEL: @test_vshrq_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK: ret <2 x i64> [[VSHR_N]]
int64x2_t test_vshrq_n_s64(int64x2_t a) {
  return vshrq_n_s64(a, 1);
}

// CHECK-LABEL: @test_vshrq_n_u8(
// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK: ret <16 x i8> [[VSHR_N]]
uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
  return vshrq_n_u8(a, 1);
}

// CHECK-LABEL: @test_vshrq_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: ret <8 x i16> [[VSHR_N]]
uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
  return vshrq_n_u16(a, 1);
}

// CHECK-LABEL: @test_vshrq_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
// CHECK: ret <4 x i32> [[VSHR_N]]
uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
  return vshrq_n_u32(a, 1);
}

// CHECK-LABEL: @test_vshrq_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
// CHECK: ret <2 x i64> [[VSHR_N]]
uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
  return vshrq_n_u64(a, 1);
}
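
// vshr_n_*/vshrq_n_* select ashr for signed and lshr for unsigned element
// types; unlike the rounding forms no bias is added, so ordinary IR shifts
// suffice here as well.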

// CHECK-LABEL: @test_vsli_n_s8(
// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <8 x i8> [[VSLI_N]]
int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
  return vsli_n_s8(a, b, 1);
}

// CHECK-LABEL: @test_vsli_n_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK: ret <4 x i16> [[VSLI_N2]]
int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
  return vsli_n_s16(a, b, 1);
}

// CHECK-LABEL: @test_vsli_n_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
// CHECK: ret <2 x i32> [[VSLI_N2]]
int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
  return vsli_n_s32(a, b, 1);
}

// CHECK-LABEL: @test_vsli_n_s64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
// CHECK: ret <1 x i64> [[VSLI_N2]]
int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
  return vsli_n_s64(a, b, 1);
}

// CHECK-LABEL: @test_vsli_n_u8(
// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <8 x i8> [[VSLI_N]]
uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
  return vsli_n_u8(a, b, 1);
}

// CHECK-LABEL: @test_vsli_n_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK: ret <4 x i16> [[VSLI_N2]]
uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
  return vsli_n_u16(a, b, 1);
}

// CHECK-LABEL: @test_vsli_n_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
// CHECK: ret <2 x i32> [[VSLI_N2]]
uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsli_n_u32(a, b, 1);
}

// CHECK-LABEL: @test_vsli_n_u64(
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
// CHECK: ret <1 x i64> [[VSLI_N2]]
uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
  return vsli_n_u64(a, b, 1);
}

// CHECK-LABEL: @test_vsli_n_p8(
// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK: ret <8 x i8> [[VSLI_N]]
poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
  return vsli_n_p8(a, b, 1);
}

// CHECK-LABEL: @test_vsli_n_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK: ret <4 x i16> [[VSLI_N2]]
poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
  return vsli_n_p16(a, b, 1);
}
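
// vsli_n_* is shift-left-and-insert: each result lane is
// (b << n) | (a & ((1 << n) - 1)), keeping the low n bits of a. It lowers
// to llvm.arm.neon.vshiftins with a positive splat count; the q-suffixed
// variants below are the same operation at twice the width.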
14594 // CHECK-LABEL: @test_vsliq_n_s8(
14595 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
14596 // CHECK: ret <16 x i8> [[VSLI_N]]
14597 int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
14598 return vsliq_n_s8(a, b, 1);
14599 }
14601 // CHECK-LABEL: @test_vsliq_n_s16(
14602 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14603 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14604 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14605 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
14606 // CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
14607 // CHECK: ret <8 x i16> [[VSLI_N2]]
14608 int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
14609 return vsliq_n_s16(a, b, 1);
14610 }
14612 // CHECK-LABEL: @test_vsliq_n_s32(
14613 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14614 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14615 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14616 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
14617 // CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
14618 // CHECK: ret <4 x i32> [[VSLI_N2]]
14619 int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
14620 return vsliq_n_s32(a, b, 1);
14621 }
14623 // CHECK-LABEL: @test_vsliq_n_s64(
14624 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14625 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14626 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14627 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
14628 // CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
14629 // CHECK: ret <2 x i64> [[VSLI_N2]]
14630 int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
14631 return vsliq_n_s64(a, b, 1);
14632 }
14634 // CHECK-LABEL: @test_vsliq_n_u8(
14635 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
14636 // CHECK: ret <16 x i8> [[VSLI_N]]
14637 uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
14638 return vsliq_n_u8(a, b, 1);
14639 }
14641 // CHECK-LABEL: @test_vsliq_n_u16(
14642 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14643 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14644 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14645 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
14646 // CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
14647 // CHECK: ret <8 x i16> [[VSLI_N2]]
14648 uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
14649 return vsliq_n_u16(a, b, 1);
14650 }
14652 // CHECK-LABEL: @test_vsliq_n_u32(
14653 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14654 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14655 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14656 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
14657 // CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
14658 // CHECK: ret <4 x i32> [[VSLI_N2]]
14659 uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
14660 return vsliq_n_u32(a, b, 1);
14661 }
14663 // CHECK-LABEL: @test_vsliq_n_u64(
14664 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14665 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14666 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14667 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
14668 // CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
14669 // CHECK: ret <2 x i64> [[VSLI_N2]]
14670 uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
14671 return vsliq_n_u64(a, b, 1);
14672 }
14674 // CHECK-LABEL: @test_vsliq_n_p8(
14675 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
14676 // CHECK: ret <16 x i8> [[VSLI_N]]
14677 poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
14678 return vsliq_n_p8(a, b, 1);
14679 }
14681 // CHECK-LABEL: @test_vsliq_n_p16(
14682 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14683 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14684 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14685 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
14686 // CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
14687 // CHECK: ret <8 x i16> [[VSLI_N2]]
14688 poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
14689 return vsliq_n_p16(a, b, 1);
14690 }
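// vsra_n/vsraq_n (shift right and accumulate) need no NEON intrinsic at all:
// with a constant shift amount, Clang emits a plain IR shift ('ashr' for the
// signed forms, 'lshr' for the unsigned forms) followed by an 'add', as the
// checks below verify. In effect, vsra_n_s8(a, b, 1) computes the same value
// as vadd_s8(a, vshr_n_s8(b, 1)).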
14692 // CHECK-LABEL: @test_vsra_n_s8(
14693 // CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14694 // CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
14695 // CHECK: ret <8 x i8> [[TMP0]]
14696 int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
14697 return vsra_n_s8(a, b, 1);
14698 }
14700 // CHECK-LABEL: @test_vsra_n_s16(
14701 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14702 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14703 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14704 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
14705 // CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
14706 // CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
14707 // CHECK: ret <4 x i16> [[TMP4]]
14708 int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
14709 return vsra_n_s16(a, b, 1);
14710 }
14712 // CHECK-LABEL: @test_vsra_n_s32(
14713 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14714 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14715 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14716 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
14717 // CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 1, i32 1>
14718 // CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
14719 // CHECK: ret <2 x i32> [[TMP4]]
14720 int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
14721 return vsra_n_s32(a, b, 1);
14722 }
14724 // CHECK-LABEL: @test_vsra_n_s64(
14725 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14726 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14727 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14728 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
14729 // CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
14730 // CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
14731 // CHECK: ret <1 x i64> [[TMP4]]
14732 int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
14733 return vsra_n_s64(a, b, 1);
14734 }
14736 // CHECK-LABEL: @test_vsra_n_u8(
14737 // CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14738 // CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
14739 // CHECK: ret <8 x i8> [[TMP0]]
14740 uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
14741 return vsra_n_u8(a, b, 1);
14742 }
14744 // CHECK-LABEL: @test_vsra_n_u16(
14745 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14746 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14747 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14748 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
14749 // CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
14750 // CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
14751 // CHECK: ret <4 x i16> [[TMP4]]
14752 uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
14753 return vsra_n_u16(a, b, 1);
14754 }
14756 // CHECK-LABEL: @test_vsra_n_u32(
14757 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14758 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14759 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14760 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
14761 // CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 1, i32 1>
14762 // CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
14763 // CHECK: ret <2 x i32> [[TMP4]]
14764 uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
14765 return vsra_n_u32(a, b, 1);
14766 }
14768 // CHECK-LABEL: @test_vsra_n_u64(
14769 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14770 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14771 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14772 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
14773 // CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
14774 // CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
14775 // CHECK: ret <1 x i64> [[TMP4]]
14776 uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
14777 return vsra_n_u64(a, b, 1);
14778 }
14780 // CHECK-LABEL: @test_vsraq_n_s8(
14781 // CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14782 // CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
14783 // CHECK: ret <16 x i8> [[TMP0]]
14784 int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
14785 return vsraq_n_s8(a, b, 1);
14786 }
14788 // CHECK-LABEL: @test_vsraq_n_s16(
14789 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14790 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14791 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14792 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
14793 // CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14794 // CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
14795 // CHECK: ret <8 x i16> [[TMP4]]
14796 int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
14797 return vsraq_n_s16(a, b, 1);
14798 }
14800 // CHECK-LABEL: @test_vsraq_n_s32(
14801 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14802 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14803 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14804 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
14805 // CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
14806 // CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
14807 // CHECK: ret <4 x i32> [[TMP4]]
14808 int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
14809 return vsraq_n_s32(a, b, 1);
14810 }
14812 // CHECK-LABEL: @test_vsraq_n_s64(
14813 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14814 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14815 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14816 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
14817 // CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 1, i64 1>
14818 // CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
14819 // CHECK: ret <2 x i64> [[TMP4]]
14820 int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
14821 return vsraq_n_s64(a, b, 1);
14822 }
14824 // CHECK-LABEL: @test_vsraq_n_u8(
14825 // CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14826 // CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
14827 // CHECK: ret <16 x i8> [[TMP0]]
14828 uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
14829 return vsraq_n_u8(a, b, 1);
14830 }
14832 // CHECK-LABEL: @test_vsraq_n_u16(
14833 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14834 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14835 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14836 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
14837 // CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14838 // CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
14839 // CHECK: ret <8 x i16> [[TMP4]]
14840 uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
14841 return vsraq_n_u16(a, b, 1);
14842 }
14844 // CHECK-LABEL: @test_vsraq_n_u32(
14845 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14846 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14847 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14848 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
14849 // CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
14850 // CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
14851 // CHECK: ret <4 x i32> [[TMP4]]
14852 uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
14853 return vsraq_n_u32(a, b, 1);
14854 }
14856 // CHECK-LABEL: @test_vsraq_n_u64(
14857 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14858 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14859 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14860 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
14861 // CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 1, i64 1>
14862 // CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
14863 // CHECK: ret <2 x i64> [[TMP4]]
14864 uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
14865 return vsraq_n_u64(a, b, 1);
14866 }
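// vsri_n/vsriq_n (shift right and insert) reuse the same
// @llvm.arm.neon.vshiftins intrinsic as the vsli tests above; the shift
// direction is encoded in the sign of the constant vector, so a right shift
// by 1 appears as a splat of -1 in the checks below. Per element, the top n
// bits of the first operand are preserved and the remaining bits are filled
// from the second operand shifted right by n.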
14868 // CHECK-LABEL: @test_vsri_n_s8(
14869 // CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
14870 // CHECK: ret <8 x i8> [[VSLI_N]]
14871 int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
14872 return vsri_n_s8(a, b, 1);
14873 }
14875 // CHECK-LABEL: @test_vsri_n_s16(
14876 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14877 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14878 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14879 // CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
14880 // CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
14881 // CHECK: ret <4 x i16> [[VSLI_N2]]
14882 int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
14883 return vsri_n_s16(a, b, 1);
14884 }
14886 // CHECK-LABEL: @test_vsri_n_s32(
14887 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14888 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14889 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14890 // CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
14891 // CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
14892 // CHECK: ret <2 x i32> [[VSLI_N2]]
14893 int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
14894 return vsri_n_s32(a, b, 1);
14895 }
14897 // CHECK-LABEL: @test_vsri_n_s64(
14898 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14899 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14900 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14901 // CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
14902 // CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
14903 // CHECK: ret <1 x i64> [[VSLI_N2]]
14904 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
14905 return vsri_n_s64(a, b, 1);
14906 }
14908 // CHECK-LABEL: @test_vsri_n_u8(
14909 // CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
14910 // CHECK: ret <8 x i8> [[VSLI_N]]
14911 uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
14912 return vsri_n_u8(a, b, 1);
14913 }
14915 // CHECK-LABEL: @test_vsri_n_u16(
14916 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14917 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14918 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14919 // CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
14920 // CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
14921 // CHECK: ret <4 x i16> [[VSLI_N2]]
14922 uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
14923 return vsri_n_u16(a, b, 1);
14924 }
14926 // CHECK-LABEL: @test_vsri_n_u32(
14927 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14928 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14929 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14930 // CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
14931 // CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
14932 // CHECK: ret <2 x i32> [[VSLI_N2]]
14933 uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
14934 return vsri_n_u32(a, b, 1);
14935 }
14937 // CHECK-LABEL: @test_vsri_n_u64(
14938 // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14939 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14940 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14941 // CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
14942 // CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
14943 // CHECK: ret <1 x i64> [[VSLI_N2]]
14944 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
14945 return vsri_n_u64(a, b, 1);
14946 }
14948 // CHECK-LABEL: @test_vsri_n_p8(
14949 // CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
14950 // CHECK: ret <8 x i8> [[VSLI_N]]
14951 poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
14952 return vsri_n_p8(a, b, 1);
14953 }
14955 // CHECK-LABEL: @test_vsri_n_p16(
14956 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14957 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14958 // CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14959 // CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
14960 // CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
14961 // CHECK: ret <4 x i16> [[VSLI_N2]]
14962 poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
14963 return vsri_n_p16(a, b, 1);
14964 }
14966 // CHECK-LABEL: @test_vsriq_n_s8(
14967 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
14968 // CHECK: ret <16 x i8> [[VSLI_N]]
14969 int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
14970 return vsriq_n_s8(a, b, 1);
14971 }
14973 // CHECK-LABEL: @test_vsriq_n_s16(
14974 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14975 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14976 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14977 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
14978 // CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
14979 // CHECK: ret <8 x i16> [[VSLI_N2]]
14980 int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
14981 return vsriq_n_s16(a, b, 1);
14982 }
14984 // CHECK-LABEL: @test_vsriq_n_s32(
14985 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14986 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14987 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14988 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
14989 // CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
14990 // CHECK: ret <4 x i32> [[VSLI_N2]]
14991 int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
14992 return vsriq_n_s32(a, b, 1);
14993 }
14995 // CHECK-LABEL: @test_vsriq_n_s64(
14996 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14997 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14998 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14999 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15000 // CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
15001 // CHECK: ret <2 x i64> [[VSLI_N2]]
15002 int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
15003 return vsriq_n_s64(a, b, 1);
15004 }
15006 // CHECK-LABEL: @test_vsriq_n_u8(
15007 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
15008 // CHECK: ret <16 x i8> [[VSLI_N]]
15009 uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
15010 return vsriq_n_u8(a, b, 1);
15011 }
15013 // CHECK-LABEL: @test_vsriq_n_u16(
15014 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15015 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15016 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15017 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15018 // CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
15019 // CHECK: ret <8 x i16> [[VSLI_N2]]
15020 uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
15021 return vsriq_n_u16(a, b, 1);
15022 }
15024 // CHECK-LABEL: @test_vsriq_n_u32(
15025 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15026 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15027 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
15028 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15029 // CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
15030 // CHECK: ret <4 x i32> [[VSLI_N2]]
15031 uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
15032 return vsriq_n_u32(a, b, 1);
15033 }
15035 // CHECK-LABEL: @test_vsriq_n_u64(
15036 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
15037 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15038 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
15039 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15040 // CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
15041 // CHECK: ret <2 x i64> [[VSLI_N2]]
15042 uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
15043 return vsriq_n_u64(a, b, 1);
15044 }
15046 // CHECK-LABEL: @test_vsriq_n_p8(
15047 // CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
15048 // CHECK: ret <16 x i8> [[VSLI_N]]
15049 poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
15050 return vsriq_n_p8(a, b, 1);
15051 }
15053 // CHECK-LABEL: @test_vsriq_n_p16(
15054 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15055 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15056 // CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15057 // CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15058 // CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
15059 // CHECK: ret <8 x i16> [[VSLI_N2]]
15060 poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
15061 return vsriq_n_p16(a, b, 1);
15062 }
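// The vst1/vst1q tests check full-vector stores: each lowers to a call to
// @llvm.arm.neon.vst1 taking the destination pointer, the (possibly bitcast)
// vector, and a trailing i32 alignment in bytes -- 1 for 8-bit, 2 for
// 16-bit, and 4 for 32- and 64-bit element types in this configuration.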
15064 // CHECK-LABEL: @test_vst1q_u8(
15065 // CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1)
15066 // CHECK: ret void
15067 void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
15068 vst1q_u8(a, b);
15069 }
15071 // CHECK-LABEL: @test_vst1q_u16(
15072 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15073 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15074 // CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2)
15075 // CHECK: ret void
15076 void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
15077 vst1q_u16(a, b);
15078 }
15080 // CHECK-LABEL: @test_vst1q_u32(
15081 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15082 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15083 // CHECK: call void @llvm.arm.neon.vst1.p0.v4i32(ptr %a, <4 x i32> [[TMP2]], i32 4)
15084 // CHECK: ret void
15085 void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
15086 vst1q_u32(a, b);
15087 }
15089 // CHECK-LABEL: @test_vst1q_u64(
15090 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15091 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15092 // CHECK: call void @llvm.arm.neon.vst1.p0.v2i64(ptr %a, <2 x i64> [[TMP2]], i32 4)
15093 // CHECK: ret void
15094 void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
15095 vst1q_u64(a, b);
15096 }
15098 // CHECK-LABEL: @test_vst1q_s8(
15099 // CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1)
15100 // CHECK: ret void
15101 void test_vst1q_s8(int8_t * a, int8x16_t b) {
15102 vst1q_s8(a, b);
15103 }
15105 // CHECK-LABEL: @test_vst1q_s16(
15106 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15107 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15108 // CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2)
15109 // CHECK: ret void
15110 void test_vst1q_s16(int16_t * a, int16x8_t b) {
15111 vst1q_s16(a, b);
15112 }
15114 // CHECK-LABEL: @test_vst1q_s32(
15115 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15116 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15117 // CHECK: call void @llvm.arm.neon.vst1.p0.v4i32(ptr %a, <4 x i32> [[TMP2]], i32 4)
15118 // CHECK: ret void
15119 void test_vst1q_s32(int32_t * a, int32x4_t b) {
15120 vst1q_s32(a, b);
15121 }
15123 // CHECK-LABEL: @test_vst1q_s64(
15124 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15125 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15126 // CHECK: call void @llvm.arm.neon.vst1.p0.v2i64(ptr %a, <2 x i64> [[TMP2]], i32 4)
15127 // CHECK: ret void
15128 void test_vst1q_s64(int64_t * a, int64x2_t b) {
15129 vst1q_s64(a, b);
15130 }
15132 // CHECK-LABEL: @test_vst1q_f16(
15133 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
15134 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
15135 // CHECK: call void @llvm.arm.neon.vst1.p0.v8f16(ptr %a, <8 x half> [[TMP2]], i32 2)
15136 // CHECK: ret void
15137 void test_vst1q_f16(float16_t * a, float16x8_t b) {
15138 vst1q_f16(a, b);
15139 }
15141 // CHECK-LABEL: @test_vst1q_f32(
15142 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
15143 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
15144 // CHECK: call void @llvm.arm.neon.vst1.p0.v4f32(ptr %a, <4 x float> [[TMP2]], i32 4)
15145 // CHECK: ret void
15146 void test_vst1q_f32(float32_t * a, float32x4_t b) {
15147 vst1q_f32(a, b);
15148 }
15150 // CHECK-LABEL: @test_vst1q_p8(
15151 // CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1)
15152 // CHECK: ret void
15153 void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
15154 vst1q_p8(a, b);
15155 }
15157 // CHECK-LABEL: @test_vst1q_p16(
15158 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15159 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15160 // CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2)
15161 // CHECK: ret void
15162 void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
15163 vst1q_p16(a, b);
15164 }
15166 // CHECK-LABEL: @test_vst1_u8(
15167 // CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1)
15168 // CHECK: ret void
15169 void test_vst1_u8(uint8_t * a, uint8x8_t b) {
15170 vst1_u8(a, b);
15171 }
15173 // CHECK-LABEL: @test_vst1_u16(
15174 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15175 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15176 // CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2)
15177 // CHECK: ret void
15178 void test_vst1_u16(uint16_t * a, uint16x4_t b) {
15179 vst1_u16(a, b);
15180 }
15182 // CHECK-LABEL: @test_vst1_u32(
15183 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15184 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15185 // CHECK: call void @llvm.arm.neon.vst1.p0.v2i32(ptr %a, <2 x i32> [[TMP2]], i32 4)
15186 // CHECK: ret void
15187 void test_vst1_u32(uint32_t * a, uint32x2_t b) {
15188 vst1_u32(a, b);
15189 }
15191 // CHECK-LABEL: @test_vst1_u64(
15192 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15193 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15194 // CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP2]], i32 4)
15195 // CHECK: ret void
15196 void test_vst1_u64(uint64_t * a, uint64x1_t b) {
15197 vst1_u64(a, b);
15198 }
15200 // CHECK-LABEL: @test_vst1_s8(
15201 // CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1)
15202 // CHECK: ret void
15203 void test_vst1_s8(int8_t * a, int8x8_t b) {
15204 vst1_s8(a, b);
15205 }
15207 // CHECK-LABEL: @test_vst1_s16(
15208 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15209 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15210 // CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2)
15211 // CHECK: ret void
15212 void test_vst1_s16(int16_t * a, int16x4_t b) {
15213 vst1_s16(a, b);
15214 }
15216 // CHECK-LABEL: @test_vst1_s32(
15217 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15218 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15219 // CHECK: call void @llvm.arm.neon.vst1.p0.v2i32(ptr %a, <2 x i32> [[TMP2]], i32 4)
15220 // CHECK: ret void
15221 void test_vst1_s32(int32_t * a, int32x2_t b) {
15222 vst1_s32(a, b);
15223 }
15225 // CHECK-LABEL: @test_vst1_s64(
15226 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15227 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15228 // CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP2]], i32 4)
15229 // CHECK: ret void
15230 void test_vst1_s64(int64_t * a, int64x1_t b) {
15231 vst1_s64(a, b);
15232 }
15234 // CHECK-LABEL: @test_vst1_f16(
15235 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
15236 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
15237 // CHECK: call void @llvm.arm.neon.vst1.p0.v4f16(ptr %a, <4 x half> [[TMP2]], i32 2)
15238 // CHECK: ret void
15239 void test_vst1_f16(float16_t * a, float16x4_t b) {
15240 vst1_f16(a, b);
15241 }
15243 // CHECK-LABEL: @test_vst1_f32(
15244 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
15245 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
15246 // CHECK: call void @llvm.arm.neon.vst1.p0.v2f32(ptr %a, <2 x float> [[TMP2]], i32 4)
15247 // CHECK: ret void
15248 void test_vst1_f32(float32_t * a, float32x2_t b) {
15249 vst1_f32(a, b);
15250 }
15252 // CHECK-LABEL: @test_vst1_p8(
15253 // CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1)
15254 // CHECK: ret void
15255 void test_vst1_p8(poly8_t * a, poly8x8_t b) {
15256 vst1_p8(a, b);
15257 }
15259 // CHECK-LABEL: @test_vst1_p16(
15260 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15261 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15262 // CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2)
15263 // CHECK: ret void
15264 void test_vst1_p16(poly16_t * a, poly16x4_t b) {
15265 vst1_p16(a, b);
15266 }
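// Lane stores: for 8-, 16-, and 32-bit elements, vst1_lane/vst1q_lane lower
// to a plain 'extractelement' of the requested lane followed by an ordinary
// scalar store at the element's natural alignment, with no NEON intrinsic
// involved. The 64-bit q-register forms are the one exception, noted below.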
15268 // CHECK-LABEL: @test_vst1q_lane_u8(
15269 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
15270 // CHECK: store i8 [[TMP0]], ptr %a, align 1
15271 // CHECK: ret void
15272 void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
15273 vst1q_lane_u8(a, b, 15);
15274 }
15276 // CHECK-LABEL: @test_vst1q_lane_u16(
15277 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15278 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15279 // CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
15280 // CHECK: store i16 [[TMP3]], ptr %a, align 2
15281 // CHECK: ret void
15282 void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
15283 vst1q_lane_u16(a, b, 7);
15284 }
15286 // CHECK-LABEL: @test_vst1q_lane_u32(
15287 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15288 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15289 // CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
15290 // CHECK: store i32 [[TMP3]], ptr %a, align 4
15291 // CHECK: ret void
15292 void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
15293 vst1q_lane_u32(a, b, 3);
15294 }
15296 // CHECK-LABEL: @test_vst1q_lane_u64(
15297 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15298 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15299 // CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
15300 // CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP3]], i32 4)
15301 // CHECK: ret void
15302 void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
15303 vst1q_lane_u64(a, b, 1);
15304 }
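// As checked above (and for the s64 variant below), vst1q_lane on 64-bit
// elements does not extract a scalar: it selects the lane with a
// shufflevector down to <1 x i64> and stores it through
// @llvm.arm.neon.vst1.p0.v1i64 instead.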
15306 // CHECK-LABEL: @test_vst1q_lane_s8(
15307 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
15308 // CHECK: store i8 [[TMP0]], ptr %a, align 1
15309 // CHECK: ret void
15310 void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
15311 vst1q_lane_s8(a, b, 15);
15312 }
15314 // CHECK-LABEL: @test_vst1q_lane_s16(
15315 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15316 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15317 // CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
15318 // CHECK: store i16 [[TMP3]], ptr %a, align 2
15319 // CHECK: ret void
15320 void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
15321 vst1q_lane_s16(a, b, 7);
15322 }
15324 // CHECK-LABEL: @test_vst1q_lane_s32(
15325 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15326 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15327 // CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
15328 // CHECK: store i32 [[TMP3]], ptr %a, align 4
15329 // CHECK: ret void
15330 void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
15331 vst1q_lane_s32(a, b, 3);
15332 }
15334 // CHECK-LABEL: @test_vst1q_lane_s64(
15335 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15336 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15337 // CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
15338 // CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP3]], i32 4)
15339 // CHECK: ret void
15340 void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
15341 vst1q_lane_s64(a, b, 1);
15342 }
15344 // CHECK-LABEL: @test_vst1q_lane_f16(
15345 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
15346 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
15347 // CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
15348 // CHECK: store half [[TMP3]], ptr %a, align 2
15349 // CHECK: ret void
15350 void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
15351 vst1q_lane_f16(a, b, 7);
15352 }
15354 // CHECK-LABEL: @test_vst1q_lane_f32(
15355 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
15356 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
15357 // CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
15358 // CHECK: store float [[TMP3]], ptr %a, align 4
15359 // CHECK: ret void
15360 void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
15361 vst1q_lane_f32(a, b, 3);
15362 }
15364 // CHECK-LABEL: @test_vst1q_lane_p8(
15365 // CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
15366 // CHECK: store i8 [[TMP0]], ptr %a, align 1
15367 // CHECK: ret void
15368 void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
15369 vst1q_lane_p8(a, b, 15);
15370 }
15372 // CHECK-LABEL: @test_vst1q_lane_p16(
15373 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15374 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15375 // CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
15376 // CHECK: store i16 [[TMP3]], ptr %a, align 2
15377 // CHECK: ret void
15378 void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
15379 vst1q_lane_p16(a, b, 7);
15380 }
15382 // CHECK-LABEL: @test_vst1_lane_u8(
15383 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
15384 // CHECK: store i8 [[TMP0]], ptr %a, align 1
15385 // CHECK: ret void
15386 void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
15387 vst1_lane_u8(a, b, 7);
15388 }
15390 // CHECK-LABEL: @test_vst1_lane_u16(
15391 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15392 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15393 // CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
15394 // CHECK: store i16 [[TMP3]], ptr %a, align 2
15395 // CHECK: ret void
15396 void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
15397 vst1_lane_u16(a, b, 3);
15398 }
15400 // CHECK-LABEL: @test_vst1_lane_u32(
15401 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15402 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15403 // CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
15404 // CHECK: store i32 [[TMP3]], ptr %a, align 4
15405 // CHECK: ret void
15406 void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
15407 vst1_lane_u32(a, b, 1);
15408 }
15410 // CHECK-LABEL: @test_vst1_lane_u64(
15411 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15412 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15413 // CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
15414 // CHECK: store i64 [[TMP3]], ptr %a, align 4
15415 // CHECK: ret void
15416 void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
15417 vst1_lane_u64(a, b, 0);
15418 }
15420 // CHECK-LABEL: @test_vst1_lane_s8(
15421 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
15422 // CHECK: store i8 [[TMP0]], ptr %a, align 1
15423 // CHECK: ret void
15424 void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
15425 vst1_lane_s8(a, b, 7);
15426 }
15428 // CHECK-LABEL: @test_vst1_lane_s16(
15429 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15430 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15431 // CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
15432 // CHECK: store i16 [[TMP3]], ptr %a, align 2
15433 // CHECK: ret void
15434 void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
15435 vst1_lane_s16(a, b, 3);
15436 }
15438 // CHECK-LABEL: @test_vst1_lane_s32(
15439 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15440 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15441 // CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
15442 // CHECK: store i32 [[TMP3]], ptr %a, align 4
15443 // CHECK: ret void
15444 void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
15445 vst1_lane_s32(a, b, 1);
15446 }
15448 // CHECK-LABEL: @test_vst1_lane_s64(
15449 // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15450 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15451 // CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
15452 // CHECK: store i64 [[TMP3]], ptr %a, align 4
15453 // CHECK: ret void
15454 void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
15455 vst1_lane_s64(a, b, 0);
15456 }
15458 // CHECK-LABEL: @test_vst1_lane_f16(
15459 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
15460 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
15461 // CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
15462 // CHECK: store half [[TMP3]], ptr %a, align 2
15463 // CHECK: ret void
15464 void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
15465 vst1_lane_f16(a, b, 3);
15466 }
15468 // CHECK-LABEL: @test_vst1_lane_f32(
15469 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
15470 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
15471 // CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
15472 // CHECK: store float [[TMP3]], ptr %a, align 4
15473 // CHECK: ret void
15474 void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
15475 vst1_lane_f32(a, b, 1);
15476 }
15478 // CHECK-LABEL: @test_vst1_lane_p8(
15479 // CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
15480 // CHECK: store i8 [[TMP0]], ptr %a, align 1
15481 // CHECK: ret void
15482 void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
15483 vst1_lane_p8(a, b, 7);
15484 }
15486 // CHECK-LABEL: @test_vst1_lane_p16(
15487 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15488 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15489 // CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
15490 // CHECK: store i16 [[TMP3]], ptr %a, align 2
15491 // CHECK: ret void
15492 void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
15493 vst1_lane_p16(a, b, 3);
15494 }
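// The vst2 tests change shape because the argument is a structure of two
// vectors: the ABI coerces it to a [4 x i64] array ([2 x i64] for the
// 64-bit d-register forms), so the IR first spills the coerced argument to
// an alloca and memcpys it into the local __s1 copy, then loads the two
// member vectors and passes them to @llvm.arm.neon.vst2, which stores them
// interleaved. A usage sketch (hypothetical caller, not part of the test):
//   uint8x16x2_t pair = { { v0, v1 } }; // pair.val[0] = v0, pair.val[1] = v1
//   vst2q_u8(buf, pair);                // memory: v0[0], v1[0], v0[1], v1[1], ...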
15496 // CHECK-LABEL: @test_vst2q_u8(
15497 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
15498 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
15499 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
15500 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15501 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15502 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
15503 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
15504 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
15505 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
15506 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
15507 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
15508 // CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
15509 // CHECK: ret void
15510 void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
15511 vst2q_u8(a, b);
15512 }
15514 // CHECK-LABEL: @test_vst2q_u16(
15515 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
15516 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
15517 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
15518 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15519 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15520 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
15521 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
15522 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
15523 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
15524 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
15525 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
15526 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
15527 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
15528 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
15529 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
15530 // CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
15531 // CHECK: ret void
15532 void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
15533 vst2q_u16(a, b);
15534 }
15536 // CHECK-LABEL: @test_vst2q_u32(
15537 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
15538 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
15539 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
15540 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15541 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15542 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
15543 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
15544 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
15545 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
15546 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
15547 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
15548 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
15549 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
15550 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
15551 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
15552 // CHECK: call void @llvm.arm.neon.vst2.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
15553 // CHECK: ret void
15554 void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
15555 vst2q_u32(a, b);
15556 }
15558 // CHECK-LABEL: @test_vst2q_s8(
15559 // CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
15560 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
15561 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
15562 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15563 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15564 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
15565 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
15566 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
15567 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
15568 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
15569 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
15570 // CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
15571 // CHECK: ret void
15572 void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
15573 vst2q_s8(a, b);
15574 }
15576 // CHECK-LABEL: @test_vst2q_s16(
15577 // CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
15578 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
15579 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
15580 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15581 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15582 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
15583 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
15584 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
15585 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
15586 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
15587 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
15588 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
15589 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
15590 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
15591 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
15592 // CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
15593 // CHECK: ret void
15594 void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
15595 vst2q_s16(a, b);
15596 }
15598 // CHECK-LABEL: @test_vst2q_s32(
15599 // CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
15600 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
15601 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
15602 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15603 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15604 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
15605 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
15606 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
15607 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
15608 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
15609 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
15610 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
15611 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
15612 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
15613 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
15614 // CHECK: call void @llvm.arm.neon.vst2.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
15615 // CHECK: ret void
15616 void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
15617 vst2q_s32(a, b);
15618 }
15620 // CHECK-LABEL: @test_vst2q_f16(
15621 // CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
15622 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
15623 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
15624 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15625 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15626 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
15627 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
15628 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
15629 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
15630 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
15631 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
15632 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
15633 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
15634 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
15635 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
15636 // CHECK: call void @llvm.arm.neon.vst2.p0.v8f16(ptr %a, <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 2)
15637 // CHECK: ret void
15638 void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
15639 vst2q_f16(a, b);
15640 }
15642 // CHECK-LABEL: @test_vst2q_f32(
15643 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
15644 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
15645 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
15646 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15647 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15648 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
15649 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
15650 // CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
15651 // CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
15652 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
15653 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
15654 // CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
15655 // CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
15656 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
15657 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
15658 // CHECK: call void @llvm.arm.neon.vst2.p0.v4f32(ptr %a, <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4)
15659 // CHECK: ret void
15660 void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
15661 vst2q_f32(a, b);
15662 }
15664 // CHECK-LABEL: @test_vst2q_p8(
15665 // CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
15666 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
15667 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
15668 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15669 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15670 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
15671 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
15672 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
15673 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
15674 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
15675 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
15676 // CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
15677 // CHECK: ret void
15678 void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
15679 vst2q_p8(a, b);
15680 }
15682 // CHECK-LABEL: @test_vst2q_p16(
15683 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
15684 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
15685 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
15686 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15687 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15688 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
15689 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
15690 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
15691 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
15692 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
15693 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
15694 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
15695 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
15696 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
15697 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
15698 // CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
15699 // CHECK: ret void
15700 void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
15701 vst2q_p16(a, b);
15702 }
15704 // CHECK-LABEL: @test_vst2_u8(
15705 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
15706 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
15707 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
15708 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15709 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15710 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
15711 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
15712 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
15713 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
15714 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
15715 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
15716 // CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
15717 // CHECK: ret void
15718 void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
15719 vst2_u8(a, b);
15720 }
15722 // CHECK-LABEL: @test_vst2_u16(
15723 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
15724 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
15725 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
15726 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15727 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15728 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
15729 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
15730 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
15731 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
15732 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
15733 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
15734 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
15735 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
15736 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
15737 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
15738 // CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
15739 // CHECK: ret void
15740 void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
15741 vst2_u16(a, b);
15742 }
15744 // CHECK-LABEL: @test_vst2_u32(
15745 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
15746 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
15747 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
15748 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15749 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15750 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
15751 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
15752 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
15753 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
15754 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
15755 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
15756 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
15757 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
15758 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
15759 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
15760 // CHECK: call void @llvm.arm.neon.vst2.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
15761 // CHECK: ret void
15762 void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
15763 vst2_u32(a, b);
15764 }
15766 // CHECK-LABEL: @test_vst2_u64(
15767 // CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
15768 // CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
15769 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
15770 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15771 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15772 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
15773 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
15774 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
15775 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
15776 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
15777 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
15778 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
15779 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
15780 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
15781 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
15782 // CHECK: call void @llvm.arm.neon.vst2.p0.v1i64(ptr %a, <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
15783 // CHECK: ret void
15784 void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
15785 vst2_u64(a, b);
15786 }
15788 // CHECK-LABEL: @test_vst2_s8(
15789 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
15790 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
15791 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
15792 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15793 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15794 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
15795 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
15796 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
15797 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
15798 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
15799 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
15800 // CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
15801 // CHECK: ret void
15802 void test_vst2_s8(int8_t * a, int8x8x2_t b) {
15803 vst2_s8(a, b);
15804 }
15806 // CHECK-LABEL: @test_vst2_s16(
15807 // CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
15808 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
15809 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
15810 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15811 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15812 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
15813 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
15814 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
15815 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
15816 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
15817 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
15818 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
15819 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
15820 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
15821 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
15822 // CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
15823 // CHECK: ret void
15824 void test_vst2_s16(int16_t * a, int16x4x2_t b) {
15825 vst2_s16(a, b);
15826 }
15828 // CHECK-LABEL: @test_vst2_s32(
15829 // CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
15830 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
15831 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
15832 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15833 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15834 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
15835 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
15836 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
15837 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
15838 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
15839 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
15840 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
15841 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
15842 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
15843 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
15844 // CHECK: call void @llvm.arm.neon.vst2.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
15845 // CHECK: ret void
15846 void test_vst2_s32(int32_t * a, int32x2x2_t b) {
15847 vst2_s32(a, b);
15848 }
15850 // CHECK-LABEL: @test_vst2_s64(
15851 // CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
15852 // CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
15853 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
15854 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15855 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15856 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
15857 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
15858 // CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
15859 // CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
15860 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
15861 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
15862 // CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
15863 // CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
15864 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
15865 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
15866 // CHECK: call void @llvm.arm.neon.vst2.p0.v1i64(ptr %a, <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
15867 // CHECK: ret void
15868 void test_vst2_s64(int64_t * a, int64x1x2_t b) {
15869 vst2_s64(a, b);
15870 }
15872 // CHECK-LABEL: @test_vst2_f16(
15873 // CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
15874 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
15875 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
15876 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15877 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15878 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
15879 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
15880 // CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
15881 // CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
15882 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
15883 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
15884 // CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
15885 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
15886 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
15887 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
15888 // CHECK: call void @llvm.arm.neon.vst2.p0.v4f16(ptr %a, <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 2)
15889 // CHECK: ret void
15890 void test_vst2_f16(float16_t * a, float16x4x2_t b) {
15891 vst2_f16(a, b);
15892 }
15894 // CHECK-LABEL: @test_vst2_f32(
15895 // CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
15896 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
15897 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
15898 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15899 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15900 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
15901 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
15902 // CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
15903 // CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
15904 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
15905 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
15906 // CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
15907 // CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
15908 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
15909 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
15910 // CHECK: call void @llvm.arm.neon.vst2.p0.v2f32(ptr %a, <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
15911 // CHECK: ret void
15912 void test_vst2_f32(float32_t * a, float32x2x2_t b) {
15913 vst2_f32(a, b);
15914 }
15916 // CHECK-LABEL: @test_vst2_p8(
15917 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
15918 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
15919 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
15920 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15921 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15922 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
15923 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
15924 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
15925 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
15926 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
15927 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
15928 // CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
15929 // CHECK: ret void
15930 void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
15931 vst2_p8(a, b);
15932 }
15934 // CHECK-LABEL: @test_vst2_p16(
15935 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
15936 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
15937 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
15938 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
15939 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
15940 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
15941 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
15942 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
15943 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
15944 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
15945 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
15946 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
15947 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
15948 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
15949 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
15950 // CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
15951 // CHECK: ret void
15952 void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
15953 vst2_p16(a, b);
15954 }
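// The *_lane tests that follow exercise llvm.arm.neon.vst2lane rather than
// the full-structure store: the call carries the constant lane index as an
// extra operand before the alignment (e.g. lane 7 for <8 x i16>, lane 3 for
// <4 x i32>), matching the immediate passed to the intrinsic in source.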
15956 // CHECK-LABEL: @test_vst2q_lane_u16(
15957 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
15958 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
15959 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
15960 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15961 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15962 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
15963 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
15964 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
15965 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
15966 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
15967 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
15968 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
15969 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
15970 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
15971 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
15972 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
15973 // CHECK: ret void
15974 void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
15975 vst2q_lane_u16(a, b, 7);
15976 }
15978 // CHECK-LABEL: @test_vst2q_lane_u32(
15979 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
15980 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
15981 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
15982 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
15983 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
15984 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
15985 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
15986 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
15987 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
15988 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
15989 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
15990 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
15991 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
15992 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
15993 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
15994 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
15995 // CHECK: ret void
15996 void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
15997 vst2q_lane_u32(a, b, 3);
15998 }
16000 // CHECK-LABEL: @test_vst2q_lane_s16(
16001 // CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
16002 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
16003 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
16004 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16005 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
16006 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
16007 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
16008 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
16009 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16010 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
16011 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
16012 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
16013 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16014 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16015 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16016 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
16017 // CHECK: ret void
16018 void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
16019 vst2q_lane_s16(a, b, 7);
16020 }
16022 // CHECK-LABEL: @test_vst2q_lane_s32(
16023 // CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
16024 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
16025 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
16026 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16027 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
16028 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
16029 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
16030 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
16031 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
16032 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
16033 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
16034 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
16035 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
16036 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
16037 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
16038 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
16039 // CHECK: ret void
16040 void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
16041 vst2q_lane_s32(a, b, 3);
16042 }
16044 // CHECK-LABEL: @test_vst2q_lane_f16(
16045 // CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
16046 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
16047 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
16048 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16049 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
16050 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
16051 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
16052 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
16053 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
16054 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
16055 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
16056 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
16057 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
16058 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
16059 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
16060 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v8f16(ptr %a, <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 7, i32 2)
16061 // CHECK: ret void
16062 void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
16063 vst2q_lane_f16(a, b, 7);
16064 }
16066 // CHECK-LABEL: @test_vst2q_lane_f32(
16067 // CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
16068 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
16069 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
16070 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16071 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
16072 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
16073 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
16074 // CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
16075 // CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
16076 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
16077 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
16078 // CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
16079 // CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
16080 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
16081 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
16082 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr %a, <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
16083 // CHECK: ret void
16084 void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
16085 vst2q_lane_f32(a, b, 3);
16086 }
16088 // CHECK-LABEL: @test_vst2q_lane_p16(
16089 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
16090 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
16091 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
16092 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16093 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
16094 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
16095 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
16096 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
16097 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16098 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
16099 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
16100 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
16101 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16102 // CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16103 // CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16104 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
16105 // CHECK: ret void
16106 void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
16107 vst2q_lane_p16(a, b, 7);
16108 }
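// The 64-bit vst2_lane variants below mirror the q-register ones with
// narrower vectors, so the valid lane range shrinks accordingly (7 for
// <8 x i8>, 3 for <4 x i16>/<4 x half>, 1 for <2 x i32>/<2 x float>).
// A minimal usage sketch (illustrative only, not a CHECK'd test; the helper
// name is hypothetical): storing one interleaved lane pair.
static inline void store_lane3_pair(uint16_t *p, uint16x4x2_t v) {
  // Writes v.val[0][3] then v.val[1][3] contiguously starting at p.
  vst2_lane_u16(p, v, 3);
}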
16110 // CHECK-LABEL: @test_vst2_lane_u8(
16111 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
16112 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
16113 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
16114 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16115 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16116 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
16117 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
16118 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
16119 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
16120 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
16121 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
16122 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
16123 // CHECK: ret void
16124 void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
16125 vst2_lane_u8(a, b, 7);
16126 }
16128 // CHECK-LABEL: @test_vst2_lane_u16(
16129 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
16130 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
16131 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
16132 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16133 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16134 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
16135 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
16136 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
16137 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16138 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
16139 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
16140 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
16141 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
16142 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16143 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
16144 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
16145 // CHECK: ret void
16146 void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
16147 vst2_lane_u16(a, b, 3);
16148 }
16150 // CHECK-LABEL: @test_vst2_lane_u32(
16151 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
16152 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
16153 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
16154 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16155 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16156 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
16157 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
16158 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
16159 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
16160 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
16161 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
16162 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
16163 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
16164 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
16165 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
16166 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
16167 // CHECK: ret void
16168 void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
16169 vst2_lane_u32(a, b, 1);
16170 }
16172 // CHECK-LABEL: @test_vst2_lane_s8(
16173 // CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
16174 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
16175 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
16176 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16177 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16178 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
16179 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
16180 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
16181 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
16182 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
16183 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
16184 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
16185 // CHECK: ret void
16186 void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
16187 vst2_lane_s8(a, b, 7);
16188 }
16190 // CHECK-LABEL: @test_vst2_lane_s16(
16191 // CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
16192 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
16193 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
16194 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16195 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16196 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
16197 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
16198 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
16199 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16200 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
16201 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
16202 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
16203 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
16204 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16205 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
16206 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
16207 // CHECK: ret void
16208 void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
16209 vst2_lane_s16(a, b, 3);
16210 }
16212 // CHECK-LABEL: @test_vst2_lane_s32(
16213 // CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
16214 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
16215 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
16216 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16217 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16218 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
16219 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
16220 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
16221 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
16222 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
16223 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
16224 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
16225 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
16226 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
16227 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
16228 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
16229 // CHECK: ret void
16230 void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
16231 vst2_lane_s32(a, b, 1);
16232 }
16234 // CHECK-LABEL: @test_vst2_lane_f16(
16235 // CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
16236 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
16237 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
16238 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16239 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16240 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
16241 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
16242 // CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
16243 // CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
16244 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
16245 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
16246 // CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
16247 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
16248 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
16249 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
16250 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v4f16(ptr %a, <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 3, i32 2)
16251 // CHECK: ret void
16252 void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
16253 vst2_lane_f16(a, b, 3);
16254 }
16256 // CHECK-LABEL: @test_vst2_lane_f32(
16257 // CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
16258 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
16259 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
16260 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16261 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16262 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
16263 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
16264 // CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
16265 // CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
16266 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
16267 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
16268 // CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
16269 // CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
16270 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
16271 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
16272 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr %a, <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
16273 // CHECK: ret void
16274 void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
16275 vst2_lane_f32(a, b, 1);
16276 }
16278 // CHECK-LABEL: @test_vst2_lane_p8(
16279 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
16280 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
16281 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
16282 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16283 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16284 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
16285 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
16286 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
16287 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
16288 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
16289 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
16290 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
16291 // CHECK: ret void
16292 void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
16293 vst2_lane_p8(a, b, 7);
16294 }
16296 // CHECK-LABEL: @test_vst2_lane_p16(
16297 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
16298 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
16299 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
16300 // CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
16301 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
16302 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
16303 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
16304 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
16305 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16306 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
16307 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
16308 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
16309 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
16310 // CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16311 // CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
16312 // CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
16313 // CHECK: ret void
16314 void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
16315 vst2_lane_p16(a, b, 3);
16316 }
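// The vst3 tests below follow the same lowering shape with three vectors per
// structure: the struct is coerced as [6 x i64] (48 bytes) and
// llvm.arm.neon.vst3 takes three vector operands plus the alignment.
// Illustrative sketch (hypothetical helper, not part of the checked tests):
// a classic vst3 use is writing three de-interleaved planes back as packed
// RGB bytes.
static inline void store_rgb48(uint8_t *dst, uint8x16x3_t rgb) {
  // Emits R0,G0,B0,R1,G1,B1,... for 48 bytes starting at dst.
  vst3q_u8(dst, rgb);
}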
16318 // CHECK-LABEL: @test_vst3q_u8(
16319 // CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
16320 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
16321 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
16322 // CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16323 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
16324 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
16325 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
16326 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
16327 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
16328 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
16329 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
16330 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
16331 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
16332 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
16333 // CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
16334 // CHECK: ret void
16335 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
16336 vst3q_u8(a, b);
16337 }
16339 // CHECK-LABEL: @test_vst3q_u16(
16340 // CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
16341 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
16342 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
16343 // CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16344 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
16345 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
16346 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
16347 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
16348 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16349 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
16350 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
16351 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
16352 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16353 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
16354 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
16355 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
16356 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
16357 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16358 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16359 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
16360 // CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
16361 // CHECK: ret void
16362 void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
16363 vst3q_u16(a, b);
16364 }
16366 // CHECK-LABEL: @test_vst3q_u32(
16367 // CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
16368 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
16369 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
16370 // CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16371 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
16372 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
16373 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
16374 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
16375 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
16376 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
16377 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
16378 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
16379 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
16380 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
16381 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
16382 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
16383 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
16384 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
16385 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
16386 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
16387 // CHECK: call void @llvm.arm.neon.vst3.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
16388 // CHECK: ret void
16389 void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
16390 vst3q_u32(a, b);
16391 }
16393 // CHECK-LABEL: @test_vst3q_s8(
16394 // CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
16395 // CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
16396 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
16397 // CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16398 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
16399 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
16400 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
16401 // CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
16402 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
16403 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
16404 // CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
16405 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
16406 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
16407 // CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
16408 // CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
16409 // CHECK: ret void
16410 void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
16411 vst3q_s8(a, b);
16412 }
16414 // CHECK-LABEL: @test_vst3q_s16(
16415 // CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
16416 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
16417 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
16418 // CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16419 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
16420 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
16421 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
16422 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
16423 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16424 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
16425 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
16426 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
16427 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16428 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
16429 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
16430 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
16431 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
16432 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16433 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16434 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
16435 // CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
16436 // CHECK: ret void
16437 void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
16438 vst3q_s16(a, b);
16439 }
16441 // CHECK-LABEL: @test_vst3q_s32(
16442 // CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
16443 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
16444 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
16445 // CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16446 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
16447 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
16448 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
16449 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
16450 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
16451 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
16452 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
16453 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
16454 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
16455 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
16456 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
16457 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
16458 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
16459 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
16460 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
16461 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
16462 // CHECK: call void @llvm.arm.neon.vst3.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
16463 // CHECK: ret void
16464 void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
16465 vst3q_s32(a, b);
16466 }
16468 // CHECK-LABEL: @test_vst3q_f16(
16469 // CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
16470 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
16471 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
16472 // CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
16473 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
16474 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
16475 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
16476 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
16477 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
16478 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
16479 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
16480 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
16481 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
16482 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
16483 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
16484 // CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
16485 // CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
16486 // CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
16487 // CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
16488 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
16489 // CHECK: call void @llvm.arm.neon.vst3.p0.v8f16(ptr %a, <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 2)
16490 // CHECK: ret void
16491 void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
16492 vst3q_f16(a, b);
16493 }
// CHECK-LABEL: @test_vst3q_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
// CHECK: call void @llvm.arm.neon.vst3.p0.v4f32(ptr %a, <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4)
// CHECK: ret void
void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
  vst3q_f32(a, b);
}

// CHECK-LABEL: @test_vst3q_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
// CHECK: ret void
void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
  vst3q_p8(a, b);
}

// CHECK-LABEL: @test_vst3q_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
// CHECK: ret void
void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
  vst3q_p16(a, b);
}
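
// The vst3 tests below use the 64-bit (d-register) types: the three-vector
// structs are 24 bytes, passed as [3 x i64] and copied with an 8-byte-aligned
// 24-byte memcpy instead of the 48-byte, 16-byte-aligned copy used above.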
// CHECK-LABEL: @test_vst3_u8(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
// CHECK: ret void
void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
  vst3_u8(a, b);
}

// CHECK-LABEL: @test_vst3_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
// CHECK: ret void
void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
  vst3_u16(a, b);
}

// CHECK-LABEL: @test_vst3_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK: call void @llvm.arm.neon.vst3.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
// CHECK: ret void
void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
  vst3_u32(a, b);
}

// CHECK-LABEL: @test_vst3_u64(
// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
// CHECK: call void @llvm.arm.neon.vst3.p0.v1i64(ptr %a, <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
// CHECK: ret void
void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
  vst3_u64(a, b);
}

// CHECK-LABEL: @test_vst3_s8(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
// CHECK: ret void
void test_vst3_s8(int8_t * a, int8x8x3_t b) {
  vst3_s8(a, b);
}

// CHECK-LABEL: @test_vst3_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
// CHECK: ret void
void test_vst3_s16(int16_t * a, int16x4x3_t b) {
  vst3_s16(a, b);
}

// CHECK-LABEL: @test_vst3_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK: call void @llvm.arm.neon.vst3.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
// CHECK: ret void
void test_vst3_s32(int32_t * a, int32x2x3_t b) {
  vst3_s32(a, b);
}

// CHECK-LABEL: @test_vst3_s64(
// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
// CHECK: call void @llvm.arm.neon.vst3.p0.v1i64(ptr %a, <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
// CHECK: ret void
void test_vst3_s64(int64_t * a, int64x1x3_t b) {
  vst3_s64(a, b);
}

// CHECK-LABEL: @test_vst3_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
// CHECK: call void @llvm.arm.neon.vst3.p0.v4f16(ptr %a, <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 2)
// CHECK: ret void
void test_vst3_f16(float16_t * a, float16x4x3_t b) {
  vst3_f16(a, b);
}

// CHECK-LABEL: @test_vst3_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
// CHECK: call void @llvm.arm.neon.vst3.p0.v2f32(ptr %a, <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4)
// CHECK: ret void
void test_vst3_f32(float32_t * a, float32x2x3_t b) {
  vst3_f32(a, b);
}

// CHECK-LABEL: @test_vst3_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
// CHECK: ret void
void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
  vst3_p8(a, b);
}

// CHECK-LABEL: @test_vst3_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
// CHECK: ret void
void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
  vst3_p16(a, b);
}
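
// The *_lane tests store a single lane from each of the three source vectors.
// In the @llvm.arm.neon.vst3lane calls checked below, the lane index is the
// second-to-last i32 operand and the alignment argument is the last.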
// CHECK-LABEL: @test_vst3q_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
// CHECK: ret void
void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
  vst3q_lane_u16(a, b, 7);
}

// CHECK-LABEL: @test_vst3q_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
// CHECK: ret void
void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
  vst3q_lane_u32(a, b, 3);
}

// CHECK-LABEL: @test_vst3q_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
// CHECK: ret void
void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
  vst3q_lane_s16(a, b, 7);
}

// CHECK-LABEL: @test_vst3q_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
// CHECK: ret void
void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
  vst3q_lane_s32(a, b, 3);
}

// CHECK-LABEL: @test_vst3q_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8f16(ptr %a, <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 7, i32 2)
// CHECK: ret void
void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
  vst3q_lane_f16(a, b, 7);
}

// CHECK-LABEL: @test_vst3q_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr %a, <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
// CHECK: ret void
void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
  vst3q_lane_f32(a, b, 3);
}

// CHECK-LABEL: @test_vst3q_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
// CHECK: ret void
void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
  vst3q_lane_p16(a, b, 7);
}

// CHECK-LABEL: @test_vst3_lane_u8(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK: ret void
void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
  vst3_lane_u8(a, b, 7);
}

// CHECK-LABEL: @test_vst3_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK: ret void
void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
  vst3_lane_u16(a, b, 3);
}

// CHECK-LABEL: @test_vst3_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
// CHECK: ret void
void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
  vst3_lane_u32(a, b, 1);
}

// CHECK-LABEL: @test_vst3_lane_s8(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK: ret void
void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
  vst3_lane_s8(a, b, 7);
}

// CHECK-LABEL: @test_vst3_lane_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK: ret void
void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
  vst3_lane_s16(a, b, 3);
}

// CHECK-LABEL: @test_vst3_lane_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
// CHECK: ret void
void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
  vst3_lane_s32(a, b, 1);
}

// CHECK-LABEL: @test_vst3_lane_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4f16(ptr %a, <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 3, i32 2)
// CHECK: ret void
void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
  vst3_lane_f16(a, b, 3);
}

// CHECK-LABEL: @test_vst3_lane_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr %a, <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4)
// CHECK: ret void
void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
  vst3_lane_f32(a, b, 1);
}

// CHECK-LABEL: @test_vst3_lane_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK: ret void
void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
  vst3_lane_p8(a, b, 7);
}

// CHECK-LABEL: @test_vst3_lane_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK: ret void
void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
  vst3_lane_p16(a, b, 3);
}
// CHECK-LABEL: @test_vst4q_u8(
// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
// CHECK: ret void
void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
  vst4q_u8(a, b);
}

// CHECK-LABEL: @test_vst4q_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
// CHECK: ret void
void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
  vst4q_u16(a, b);
}

// CHECK-LABEL: @test_vst4q_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
// CHECK: call void @llvm.arm.neon.vst4.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
// CHECK: ret void
void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
  vst4q_u32(a, b);
}

// CHECK-LABEL: @test_vst4q_s8(
// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
// CHECK: ret void
void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
  vst4q_s8(a, b);
}

// CHECK-LABEL: @test_vst4q_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
// CHECK: ret void
void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
  vst4q_s16(a, b);
}

// CHECK-LABEL: @test_vst4q_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
// CHECK: call void @llvm.arm.neon.vst4.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
// CHECK: ret void
void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
  vst4q_s32(a, b);
}

// CHECK-LABEL: @test_vst4q_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
// CHECK: call void @llvm.arm.neon.vst4.p0.v8f16(ptr %a, <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 2)
// CHECK: ret void
void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
  vst4q_f16(a, b);
}

// CHECK-LABEL: @test_vst4q_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
// CHECK: call void @llvm.arm.neon.vst4.p0.v4f32(ptr %a, <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4)
// CHECK: ret void
void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
  vst4q_f32(a, b);
}

// CHECK-LABEL: @test_vst4q_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
// CHECK: ret void
void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
  vst4q_p8(a, b);
}

// CHECK-LABEL: @test_vst4q_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
// CHECK: ret void
void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
  vst4q_p16(a, b);
}

// CHECK-LABEL: @test_vst4_u8(
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
// CHECK: ret void
void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
  vst4_u8(a, b);
}

// CHECK-LABEL: @test_vst4_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
// CHECK: ret void
void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
  vst4_u16(a, b);
}

// CHECK-LABEL: @test_vst4_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
// CHECK: call void @llvm.arm.neon.vst4.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
// CHECK: ret void
void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
  vst4_u32(a, b);
}

// CHECK-LABEL: @test_vst4_u64(
// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
// CHECK: call void @llvm.arm.neon.vst4.p0.v1i64(ptr %a, <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
// CHECK: ret void
void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
  vst4_u64(a, b);
}

// CHECK-LABEL: @test_vst4_s8(
// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
// CHECK: ret void
void test_vst4_s8(int8_t * a, int8x8x4_t b) {
  vst4_s8(a, b);
}

// CHECK-LABEL: @test_vst4_s16(
// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
// CHECK: ret void
void test_vst4_s16(int16_t * a, int16x4x4_t b) {
  vst4_s16(a, b);
}

// CHECK-LABEL: @test_vst4_s32(
// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
// CHECK: call void @llvm.arm.neon.vst4.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
// CHECK: ret void
void test_vst4_s32(int32_t * a, int32x2x4_t b) {
  vst4_s32(a, b);
}

// CHECK-LABEL: @test_vst4_s64(
// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
// CHECK: call void @llvm.arm.neon.vst4.p0.v1i64(ptr %a, <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
// CHECK: ret void
void test_vst4_s64(int64_t * a, int64x1x4_t b) {
  vst4_s64(a, b);
}

// CHECK-LABEL: @test_vst4_f16(
// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
// CHECK: call void @llvm.arm.neon.vst4.p0.v4f16(ptr %a, <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 2)
// CHECK: ret void
void test_vst4_f16(float16_t * a, float16x4x4_t b) {
  vst4_f16(a, b);
}

// CHECK-LABEL: @test_vst4_f32(
// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
// CHECK: call void @llvm.arm.neon.vst4.p0.v2f32(ptr %a, <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4)
// CHECK: ret void
void test_vst4_f32(float32_t * a, float32x2x4_t b) {
  vst4_f32(a, b);
}

// CHECK-LABEL: @test_vst4_p8(
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
// CHECK: ret void
void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
  vst4_p8(a, b);
}

// CHECK-LABEL: @test_vst4_p16(
// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
// CHECK: ret void
void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
  vst4_p16(a, b);
}

// CHECK-LABEL: @test_vst4q_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
// CHECK: ret void
void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
  vst4q_lane_u16(a, b, 7);
}

// CHECK-LABEL: @test_vst4q_lane_u32(
// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
// CHECK: ret void
void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
  vst4q_lane_u32(a, b, 3);
}

18037 // CHECK-LABEL: @test_vst4q_lane_s16(
18038 // CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
18039 // CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
18040 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
18041 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
18042 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
18043 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
18044 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
18045 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
18046 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18047 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
18048 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
18049 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
18050 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18051 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
18052 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
18053 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
18054 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
18055 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
18056 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
18057 // CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
18058 // CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
18059 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18060 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18061 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
18062 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
18063 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
18064 // CHECK: ret void
18065 void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
18066 vst4q_lane_s16(a, b, 7);
18067 }
18069 // CHECK-LABEL: @test_vst4q_lane_s32(
18070 // CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
18071 // CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
18072 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
18073 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
18074 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
18075 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
18076 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
18077 // CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
18078 // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
18079 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
18080 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
18081 // CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
18082 // CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
18083 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
18084 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
18085 // CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
18086 // CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
18087 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
18088 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
18089 // CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
18090 // CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
18091 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
18092 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
18093 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
18094 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
18095 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
18096 // CHECK: ret void
18097 void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
18098 vst4q_lane_s32(a, b, 3);
18099 }
18101 // CHECK-LABEL: @test_vst4q_lane_f16(
18102 // CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
18103 // CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
18104 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
18105 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
18106 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
18107 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
18108 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
18109 // CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
18110 // CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
18111 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
18112 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
18113 // CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
18114 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
18115 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
18116 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
18117 // CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
18118 // CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
18119 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
18120 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
18121 // CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
18122 // CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
18123 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
18124 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
18125 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
18126 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
18127 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v8f16(ptr %a, <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 7, i32 2)
18128 // CHECK: ret void
18129 void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
18130 vst4q_lane_f16(a, b, 7);
18131 }
18133 // CHECK-LABEL: @test_vst4q_lane_f32(
18134 // CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
18135 // CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
18136 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
18137 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
18138 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
18139 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
18140 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
18141 // CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
18142 // CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
18143 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
18144 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
18145 // CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
18146 // CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
18147 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
18148 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
18149 // CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
18150 // CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
18151 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
18152 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
18153 // CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
18154 // CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
18155 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
18156 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
18157 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
18158 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
18159 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr %a, <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4)
18160 // CHECK: ret void
18161 void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
18162 vst4q_lane_f32(a, b, 3);
18163 }
18165 // CHECK-LABEL: @test_vst4q_lane_p16(
18166 // CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
18167 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
18168 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
18169 // CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
18170 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
18171 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
18172 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
18173 // CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
18174 // CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18175 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
18176 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
18177 // CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
18178 // CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18179 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
18180 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
18181 // CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
18182 // CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
18183 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
18184 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
18185 // CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
18186 // CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
18187 // CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18188 // CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18189 // CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
18190 // CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
18191 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
18192 // CHECK: ret void
18193 void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
18194 vst4q_lane_p16(a, b, 7);
18195 }
18197 // CHECK-LABEL: @test_vst4_lane_u8(
18198 // CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
18199 // CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
18200 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
18201 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18202 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18203 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
18204 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
18205 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
18206 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
18207 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
18208 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
18209 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
18210 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
18211 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
18212 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
18213 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
18214 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
18215 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
18216 // CHECK: ret void
18217 void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
18218 vst4_lane_u8(a, b, 7);
18219 }
18221 // CHECK-LABEL: @test_vst4_lane_u16(
18222 // CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
18223 // CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
18224 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
18225 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18226 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18227 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
18228 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
18229 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
18230 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18231 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
18232 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
18233 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
18234 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18235 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
18236 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
18237 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
18238 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
18239 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
18240 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
18241 // CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
18242 // CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
18243 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18244 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18245 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
18246 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
18247 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
18248 // CHECK: ret void
18249 void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
18250 vst4_lane_u16(a, b, 3);
18251 }
18253 // CHECK-LABEL: @test_vst4_lane_u32(
18254 // CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
18255 // CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
18256 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
18257 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18258 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18259 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
18260 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
18261 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
18262 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
18263 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
18264 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
18265 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
18266 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
18267 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
18268 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
18269 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
18270 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
18271 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
18272 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
18273 // CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
18274 // CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
18275 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
18276 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
18277 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
18278 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
18279 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
18280 // CHECK: ret void
18281 void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
18282 vst4_lane_u32(a, b, 1);
18283 }
18285 // CHECK-LABEL: @test_vst4_lane_s8(
18286 // CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
18287 // CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
18288 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
18289 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18290 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18291 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
18292 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
18293 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
18294 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
18295 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
18296 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
18297 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
18298 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
18299 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
18300 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
18301 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
18302 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
18303 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
18304 // CHECK: ret void
18305 void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
18306 vst4_lane_s8(a, b, 7);
18307 }
18309 // CHECK-LABEL: @test_vst4_lane_s16(
18310 // CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
18311 // CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
18312 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
18313 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18314 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18315 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
18316 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
18317 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
18318 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18319 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
18320 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
18321 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
18322 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18323 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
18324 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
18325 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
18326 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
18327 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
18328 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
18329 // CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
18330 // CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
18331 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18332 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18333 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
18334 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
18335 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
18336 // CHECK: ret void
18337 void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
18338 vst4_lane_s16(a, b, 3);
18339 }
18341 // CHECK-LABEL: @test_vst4_lane_s32(
18342 // CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
18343 // CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
18344 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
18345 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18346 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18347 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
18348 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
18349 // CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
18350 // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
18351 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
18352 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
18353 // CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
18354 // CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
18355 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
18356 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
18357 // CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
18358 // CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
18359 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
18360 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
18361 // CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
18362 // CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
18363 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
18364 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
18365 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
18366 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
18367 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
18368 // CHECK: ret void
18369 void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
18370 vst4_lane_s32(a, b, 1);
18371 }
18373 // CHECK-LABEL: @test_vst4_lane_f16(
18374 // CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
18375 // CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
18376 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
18377 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18378 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18379 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
18380 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
18381 // CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
18382 // CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
18383 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
18384 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
18385 // CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
18386 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
18387 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
18388 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
18389 // CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
18390 // CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
18391 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
18392 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
18393 // CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
18394 // CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
18395 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
18396 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
18397 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
18398 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
18399 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v4f16(ptr %a, <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 3, i32 2)
18400 // CHECK: ret void
18401 void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
18402 vst4_lane_f16(a, b, 3);
18403 }
18405 // CHECK-LABEL: @test_vst4_lane_f32(
18406 // CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
18407 // CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
18408 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
18409 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18410 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18411 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
18412 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
18413 // CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
18414 // CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
18415 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
18416 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
18417 // CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
18418 // CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
18419 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
18420 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
18421 // CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
18422 // CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
18423 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
18424 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
18425 // CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
18426 // CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
18427 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
18428 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
18429 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
18430 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
18431 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr %a, <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4)
18432 // CHECK: ret void
18433 void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
18434 vst4_lane_f32(a, b, 1);
18435 }
18437 // CHECK-LABEL: @test_vst4_lane_p8(
18438 // CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
18439 // CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
18440 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
18441 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18442 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18443 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
18444 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
18445 // CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
18446 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
18447 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
18448 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
18449 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
18450 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
18451 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
18452 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
18453 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
18454 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
18455 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
18456 // CHECK: ret void
18457 void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
18458 vst4_lane_p8(a, b, 7);
18459 }
18461 // CHECK-LABEL: @test_vst4_lane_p16(
18462 // CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
18463 // CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
18464 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
18465 // CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
18466 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
18467 // CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
18468 // CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
18469 // CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
18470 // CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18471 // CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
18472 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
18473 // CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
18474 // CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18475 // CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
18476 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
18477 // CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
18478 // CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
18479 // CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
18480 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
18481 // CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
18482 // CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
18483 // CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18484 // CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18485 // CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
18486 // CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
18487 // CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
18488 // CHECK: ret void
18489 void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
18490 vst4_lane_p16(a, b, 3);
18491 }
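// The vsub/vsubq tests below check that plain vector subtraction lowers to a single IR sub (fsub for float vectors) with no intrinsic call.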
18493 // CHECK-LABEL: @test_vsub_s8(
18494 // CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
18495 // CHECK: ret <8 x i8> [[SUB_I]]
18496 int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) {
18497 return vsub_s8(a, b);
18498 }
18500 // CHECK-LABEL: @test_vsub_s16(
18501 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
18502 // CHECK: ret <4 x i16> [[SUB_I]]
18503 int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) {
18504 return vsub_s16(a, b);
18505 }
18507 // CHECK-LABEL: @test_vsub_s32(
18508 // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
18509 // CHECK: ret <2 x i32> [[SUB_I]]
18510 int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) {
18511 return vsub_s32(a, b);
18512 }
18514 // CHECK-LABEL: @test_vsub_s64(
18515 // CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
18516 // CHECK: ret <1 x i64> [[SUB_I]]
18517 int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) {
18518 return vsub_s64(a, b);
18519 }
18521 // CHECK-LABEL: @test_vsub_f32(
18522 // CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, %b
18523 // CHECK: ret <2 x float> [[SUB_I]]
18524 float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) {
18525 return vsub_f32(a, b);
18526 }
18528 // CHECK-LABEL: @test_vsub_u8(
18529 // CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
18530 // CHECK: ret <8 x i8> [[SUB_I]]
18531 uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) {
18532 return vsub_u8(a, b);
18533 }
18535 // CHECK-LABEL: @test_vsub_u16(
18536 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
18537 // CHECK: ret <4 x i16> [[SUB_I]]
18538 uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) {
18539 return vsub_u16(a, b);
18540 }
18542 // CHECK-LABEL: @test_vsub_u32(
18543 // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
18544 // CHECK: ret <2 x i32> [[SUB_I]]
18545 uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) {
18546 return vsub_u32(a, b);
18547 }
18549 // CHECK-LABEL: @test_vsub_u64(
18550 // CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
18551 // CHECK: ret <1 x i64> [[SUB_I]]
18552 uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) {
18553 return vsub_u64(a, b);
18554 }
18556 // CHECK-LABEL: @test_vsubq_s8(
18557 // CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
18558 // CHECK: ret <16 x i8> [[SUB_I]]
18559 int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) {
18560 return vsubq_s8(a, b);
18561 }
18563 // CHECK-LABEL: @test_vsubq_s16(
18564 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
18565 // CHECK: ret <8 x i16> [[SUB_I]]
18566 int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) {
18567 return vsubq_s16(a, b);
18568 }
18570 // CHECK-LABEL: @test_vsubq_s32(
18571 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
18572 // CHECK: ret <4 x i32> [[SUB_I]]
18573 int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) {
18574 return vsubq_s32(a, b);
18575 }
18577 // CHECK-LABEL: @test_vsubq_s64(
18578 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
18579 // CHECK: ret <2 x i64> [[SUB_I]]
18580 int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) {
18581 return vsubq_s64(a, b);
18582 }
18584 // CHECK-LABEL: @test_vsubq_f32(
18585 // CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, %b
18586 // CHECK: ret <4 x float> [[SUB_I]]
18587 float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) {
18588 return vsubq_f32(a, b);
18589 }
18591 // CHECK-LABEL: @test_vsubq_u8(
18592 // CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
18593 // CHECK: ret <16 x i8> [[SUB_I]]
18594 uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) {
18595 return vsubq_u8(a, b);
18596 }
18598 // CHECK-LABEL: @test_vsubq_u16(
18599 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
18600 // CHECK: ret <8 x i16> [[SUB_I]]
18601 uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) {
18602 return vsubq_u16(a, b);
18603 }
18605 // CHECK-LABEL: @test_vsubq_u32(
18606 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
18607 // CHECK: ret <4 x i32> [[SUB_I]]
18608 uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) {
18609 return vsubq_u32(a, b);
18610 }
18612 // CHECK-LABEL: @test_vsubq_u64(
18613 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
18614 // CHECK: ret <2 x i64> [[SUB_I]]
18615 uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
18616 return vsubq_u64(a, b);
18617 }
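// vsubhn: high-half narrowing subtract. The expected pattern is sub, lshr by half the element width, then trunc to the narrow type.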
18619 // CHECK-LABEL: @test_vsubhn_s16(
18620 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
18621 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
18622 // CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
18623 // CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
18624 // CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
18625 // CHECK: ret <8 x i8> [[VSUBHN2_I]]
18626 int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
18627 return vsubhn_s16(a, b);
18628 }
18630 // CHECK-LABEL: @test_vsubhn_s32(
18631 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
18632 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
18633 // CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
18634 // CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
18635 // CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
18636 // CHECK: ret <4 x i16> [[VSUBHN2_I]]
18637 int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
18638 return vsubhn_s32(a, b);
18639 }
18641 // CHECK-LABEL: @test_vsubhn_s64(
18642 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
18643 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
18644 // CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
18645 // CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
18646 // CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
18647 // CHECK: ret <2 x i32> [[VSUBHN2_I]]
18648 int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
18649 return vsubhn_s64(a, b);
18650 }
18652 // CHECK-LABEL: @test_vsubhn_u16(
18653 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
18654 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
18655 // CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
18656 // CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
18657 // CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
18658 // CHECK: ret <8 x i8> [[VSUBHN2_I]]
18659 uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
18660 return vsubhn_u16(a, b);
18661 }
18663 // CHECK-LABEL: @test_vsubhn_u32(
18664 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
18665 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
18666 // CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
18667 // CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
18668 // CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
18669 // CHECK: ret <4 x i16> [[VSUBHN2_I]]
18670 uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
18671 return vsubhn_u32(a, b);
18672 }
18674 // CHECK-LABEL: @test_vsubhn_u64(
18675 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
18676 // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
18677 // CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
18678 // CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
18679 // CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
18680 // CHECK: ret <2 x i32> [[VSUBHN2_I]]
18681 uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
18682 return vsubhn_u64(a, b);
18683 }
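// vsubl: long subtract. Both operands are sign- or zero-extended to double width before the sub.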
18685 // CHECK-LABEL: @test_vsubl_s8(
18686 // CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
18687 // CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
18688 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
18689 // CHECK: ret <8 x i16> [[SUB_I]]
18690 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
18691 return vsubl_s8(a, b);
18692 }
18694 // CHECK-LABEL: @test_vsubl_s16(
18695 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
18696 // CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
18697 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18698 // CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
18699 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
18700 // CHECK: ret <4 x i32> [[SUB_I]]
18701 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
18702 return vsubl_s16(a, b);
18703 }
18705 // CHECK-LABEL: @test_vsubl_s32(
18706 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
18707 // CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
18708 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
18709 // CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
18710 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
18711 // CHECK: ret <2 x i64> [[SUB_I]]
18712 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
18713 return vsubl_s32(a, b);
18714 }
18716 // CHECK-LABEL: @test_vsubl_u8(
18717 // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
18718 // CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
18719 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
18720 // CHECK: ret <8 x i16> [[SUB_I]]
18721 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
18722 return vsubl_u8(a, b);
18723 }
18725 // CHECK-LABEL: @test_vsubl_u16(
18726 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
18727 // CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
18728 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18729 // CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
18730 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
18731 // CHECK: ret <4 x i32> [[SUB_I]]
18732 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
18733 return vsubl_u16(a, b);
18734 }
18736 // CHECK-LABEL: @test_vsubl_u32(
18737 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
18738 // CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
18739 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
18740 // CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
18741 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
18742 // CHECK: ret <2 x i64> [[SUB_I]]
18743 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
18744 return vsubl_u32(a, b);
18745 }
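// vsubw: wide subtract. Only the second (narrow) operand is extended; the first already has the wide type.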
18747 // CHECK-LABEL: @test_vsubw_s8(
18748 // CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
18749 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
18750 // CHECK: ret <8 x i16> [[SUB_I]]
18751 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
18752 return vsubw_s8(a, b);
18753 }
18755 // CHECK-LABEL: @test_vsubw_s16(
18756 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18757 // CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
18758 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
18759 // CHECK: ret <4 x i32> [[SUB_I]]
18760 int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
18761 return vsubw_s16(a, b);
18762 }
18764 // CHECK-LABEL: @test_vsubw_s32(
18765 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
18766 // CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
18767 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
18768 // CHECK: ret <2 x i64> [[SUB_I]]
18769 int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
18770 return vsubw_s32(a, b);
18771 }
18773 // CHECK-LABEL: @test_vsubw_u8(
18774 // CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
18775 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
18776 // CHECK: ret <8 x i16> [[SUB_I]]
18777 uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
18778 return vsubw_u8(a, b);
18779 }
18781 // CHECK-LABEL: @test_vsubw_u16(
18782 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18783 // CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
18784 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
18785 // CHECK: ret <4 x i32> [[SUB_I]]
18786 uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
18787 return vsubw_u16(a, b);
18788 }
18790 // CHECK-LABEL: @test_vsubw_u32(
18791 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
18792 // CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
18793 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
18794 // CHECK: ret <2 x i64> [[SUB_I]]
18795 uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
18796 return vsubw_u32(a, b);
18797 }
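// vtbl1-vtbl4: byte table lookups via the llvm.arm.neon.vtblN intrinsics. Multi-register tables arrive as NEON struct types and are unpacked through allocas.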
18799 // CHECK-LABEL: @test_vtbl1_u8(
18800 // CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
18801 // CHECK: ret <8 x i8> [[VTBL1_I]]
18802 uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
18803 return vtbl1_u8(a, b);
18804 }
18806 // CHECK-LABEL: @test_vtbl1_s8(
18807 // CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
18808 // CHECK: ret <8 x i8> [[VTBL1_I]]
18809 int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
18810 return vtbl1_s8(a, b);
18811 }
18813 // CHECK-LABEL: @test_vtbl1_p8(
18814 // CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
18815 // CHECK: ret <8 x i8> [[VTBL1_I]]
18816 poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
18817 return vtbl1_p8(a, b);
18818 }
18820 // CHECK-LABEL: @test_vtbl2_u8(
18821 // CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
18822 // CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
18823 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[A]], i32 0, i32 0
18824 // CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
18825 // CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[A]], i32 0, i32 0
18826 // CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
18827 // CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
18828 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
18829 // CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
18830 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
18831 // CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
18832 // CHECK: ret <8 x i8> [[VTBL2_I]]
18833 uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
18834 return vtbl2_u8(a, b);
18835 }
18837 // CHECK-LABEL: @test_vtbl2_s8(
18838 // CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
18839 // CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
18840 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[A]], i32 0, i32 0
18841 // CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
18842 // CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[A]], i32 0, i32 0
18843 // CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
18844 // CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
18845 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
18846 // CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
18847 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
18848 // CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
18849 // CHECK: ret <8 x i8> [[VTBL2_I]]
18850 int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
18851 return vtbl2_s8(a, b);
18852 }
18854 // CHECK-LABEL: @test_vtbl2_p8(
18855 // CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
18856 // CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
18857 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[A]], i32 0, i32 0
18858 // CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
18859 // CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[A]], i32 0, i32 0
18860 // CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
18861 // CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
18862 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
18863 // CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
18864 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
18865 // CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
18866 // CHECK: ret <8 x i8> [[VTBL2_I]]
18867 poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
18868 return vtbl2_p8(a, b);
18869 }
18871 // CHECK-LABEL: @test_vtbl3_u8(
18872 // CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
18873 // CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
18874 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[A]], i32 0, i32 0
18875 // CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
18876 // CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[A]], i32 0, i32 0
18877 // CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
18878 // CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
18879 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
18880 // CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
18881 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
18882 // CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
18883 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
18884 // CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
18885 // CHECK: ret <8 x i8> [[VTBL3_I]]
18886 uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
18887 return vtbl3_u8(a, b);
18888 }
18890 // CHECK-LABEL: @test_vtbl3_s8(
18891 // CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
18892 // CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
18893 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[A]], i32 0, i32 0
18894 // CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
18895 // CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[A]], i32 0, i32 0
18896 // CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
18897 // CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
18898 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
18899 // CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
18900 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
18901 // CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
18902 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
18903 // CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
18904 // CHECK: ret <8 x i8> [[VTBL3_I]]
18905 int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
18906 return vtbl3_s8(a, b);
18907 }
18909 // CHECK-LABEL: @test_vtbl3_p8(
18910 // CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
18911 // CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
18912 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[A]], i32 0, i32 0
18913 // CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
18914 // CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[A]], i32 0, i32 0
18915 // CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
18916 // CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
18917 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
18918 // CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
18919 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
18920 // CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
18921 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
18922 // CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
18923 // CHECK: ret <8 x i8> [[VTBL3_I]]
18924 poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
18925 return vtbl3_p8(a, b);
18926 }
18928 // CHECK-LABEL: @test_vtbl4_u8(
18929 // CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
18930 // CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
18931 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[A]], i32 0, i32 0
18932 // CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
18933 // CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[A]], i32 0, i32 0
18934 // CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
18935 // CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
18936 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
18937 // CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
18938 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
18939 // CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
18940 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
18941 // CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
18942 // CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
18943 // CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
18944 // CHECK: ret <8 x i8> [[VTBL4_I]]
18945 uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
18946 return vtbl4_u8(a, b);
18947 }
18949 // CHECK-LABEL: @test_vtbl4_s8(
18950 // CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
18951 // CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
18952 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[A]], i32 0, i32 0
18953 // CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
18954 // CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[A]], i32 0, i32 0
18955 // CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
18956 // CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
18957 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
18958 // CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
18959 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
18960 // CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
18961 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
18962 // CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
18963 // CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
18964 // CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
18965 // CHECK: ret <8 x i8> [[VTBL4_I]]
18966 int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
18967 return vtbl4_s8(a, b);
18970 // CHECK-LABEL: @test_vtbl4_p8(
18971 // CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
18972 // CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
18973 // CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[A]], i32 0, i32 0
18974 // CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
18975 // CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[A]], i32 0, i32 0
18976 // CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
18977 // CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
18978 // CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
18979 // CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
18980 // CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
18981 // CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
18982 // CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
18983 // CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
18984 // CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
18985 // CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
18986 // CHECK: ret <8 x i8> [[VTBL4_I]]
18987 poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
18988 return vtbl4_p8(a, b);
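// vtbx differs from vtbl only in how out-of-range indices behave: instead of
// producing 0, the corresponding lane of the accumulator a passes through
// unchanged. For the one-register form, roughly (a sketch, not checked output):
//   d[i] = (c[i] < 8) ? b[c[i]] : a[i]
// The two-, three-, and four-register forms below widen the in-range window
// to 16, 24, and 32 bytes respectively.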
// CHECK-LABEL: @test_vtbx1_u8(
// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX1_I]]
uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vtbx1_u8(a, b, c);
}

// CHECK-LABEL: @test_vtbx1_s8(
// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX1_I]]
int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vtbx1_s8(a, b, c);
}

// CHECK-LABEL: @test_vtbx1_p8(
// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX1_I]]
poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
  return vtbx1_p8(a, b, c);
}

// CHECK-LABEL: @test_vtbx2_u8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX2_I]]
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
  return vtbx2_u8(a, b, c);
}

// CHECK-LABEL: @test_vtbx2_s8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX2_I]]
int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
  return vtbx2_s8(a, b, c);
}

// CHECK-LABEL: @test_vtbx2_p8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX2_I]]
poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
  return vtbx2_p8(a, b, c);
}

// CHECK-LABEL: @test_vtbx3_u8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX3_I]]
uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
  return vtbx3_u8(a, b, c);
}

// CHECK-LABEL: @test_vtbx3_s8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX3_I]]
int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
  return vtbx3_s8(a, b, c);
}

// CHECK-LABEL: @test_vtbx3_p8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX3_I]]
poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
  return vtbx3_p8(a, b, c);
}

// CHECK-LABEL: @test_vtbx4_u8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX4_I]]
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
  return vtbx4_u8(a, b, c);
}

// CHECK-LABEL: @test_vtbx4_s8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX4_I]]
int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
  return vtbx4_s8(a, b, c);
}

// CHECK-LABEL: @test_vtbx4_p8(
// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8
// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
// CHECK: ret <8 x i8> [[VTBX4_I]]
poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
  return vtbx4_p8(a, b, c);
}
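// The vtrn tests return a two-vector struct, so the functions are lowered with
// an sret pointer and the checks match the two stores into it rather than a
// returned value; the !alias.scope metadata on those stores comes from
// inlining the intrinsic wrapper, whose sret parameter is noalias. vtrn itself
// transposes lane pairs: val[0] interleaves the even lanes of a and b, val[1]
// the odd lanes, as the shuffle masks spell out.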
// CHECK: @test_vtrn_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
  return vtrn_s8(a, b);
}

// CHECK: @test_vtrn_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
  return vtrn_s16(a, b);
}

// CHECK: @test_vtrn_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
  return vtrn_s32(a, b);
}

// CHECK: @test_vtrn_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
  return vtrn_u8(a, b);
}

// CHECK: @test_vtrn_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
  return vtrn_u16(a, b);
}

// CHECK: @test_vtrn_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
  return vtrn_u32(a, b);
}

// CHECK: @test_vtrn_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x float> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
  return vtrn_f32(a, b);
}

// CHECK: @test_vtrn_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
  return vtrn_p8(a, b);
}

// CHECK: @test_vtrn_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
  return vtrn_p16(a, b);
}

// CHECK: @test_vtrnq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
  return vtrnq_s8(a, b);
}

// CHECK: @test_vtrnq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
  return vtrnq_s16(a, b);
}

// CHECK: @test_vtrnq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
  return vtrnq_s32(a, b);
}

// CHECK: @test_vtrnq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
  return vtrnq_u8(a, b);
}

// CHECK: @test_vtrnq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
  return vtrnq_u16(a, b);
}

// CHECK: @test_vtrnq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
  return vtrnq_u32(a, b);
}

// CHECK: @test_vtrnq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK: store <4 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x float> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
  return vtrnq_f32(a, b);
}

// CHECK: @test_vtrnq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
  return vtrnq_p8(a, b);
}

// CHECK: @test_vtrnq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
  return vtrnq_p16(a, b);
}
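// vtst needs no target intrinsic at the IR level: each lane computes
// (a & b) != 0 and sign-extends the i1 result, so lanes where the two inputs
// share any set bit become all-ones and the rest become zero:
//   d[i] = (a[i] & b[i]) != 0 ? ~0 : 0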
// CHECK-LABEL: @test_vtst_s8(
// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
// CHECK: ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
  return vtst_s8(a, b);
}

// CHECK-LABEL: @test_vtst_s16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
  return vtst_s16(a, b);
}

// CHECK-LABEL: @test_vtst_s32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
// CHECK: ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
  return vtst_s32(a, b);
}

// CHECK-LABEL: @test_vtst_u8(
// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
// CHECK: ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
  return vtst_u8(a, b);
}

// CHECK-LABEL: @test_vtst_u16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
  return vtst_u16(a, b);
}

// CHECK-LABEL: @test_vtst_u32(
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
// CHECK: ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
  return vtst_u32(a, b);
}

// CHECK-LABEL: @test_vtst_p8(
// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
// CHECK: ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
  return vtst_p8(a, b);
}

// CHECK-LABEL: @test_vtst_p16(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
// CHECK: ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
  return vtst_p16(a, b);
}

// CHECK-LABEL: @test_vtstq_s8(
// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
// CHECK: ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
  return vtstq_s8(a, b);
}

// CHECK-LABEL: @test_vtstq_s16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
  return vtstq_s16(a, b);
}

// CHECK-LABEL: @test_vtstq_s32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
// CHECK: ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
  return vtstq_s32(a, b);
}

// CHECK-LABEL: @test_vtstq_u8(
// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
// CHECK: ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
  return vtstq_u8(a, b);
}

// CHECK-LABEL: @test_vtstq_u16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
  return vtstq_u16(a, b);
}

// CHECK-LABEL: @test_vtstq_u32(
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
// CHECK: ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
  return vtstq_u32(a, b);
}

// CHECK-LABEL: @test_vtstq_p8(
// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
// CHECK: ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
  return vtstq_p8(a, b);
}

// CHECK-LABEL: @test_vtstq_p16(
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
// CHECK: ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
  return vtstq_p16(a, b);
}
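// vuzp de-interleaves: val[0] collects the even-indexed elements of the
// concatenation of a and b, val[1] the odd-indexed ones. As with vtrn, the
// pair comes back through sret, so the checks again match two stores.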
// CHECK: @test_vuzp_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
  return vuzp_s8(a, b);
}

// CHECK: @test_vuzp_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
  return vuzp_s16(a, b);
}

// CHECK: @test_vuzp_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
  return vuzp_s32(a, b);
}

// CHECK: @test_vuzp_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
  return vuzp_u8(a, b);
}

// CHECK: @test_vuzp_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
  return vuzp_u16(a, b);
}

// CHECK: @test_vuzp_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
  return vuzp_u32(a, b);
}

// CHECK: @test_vuzp_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x float> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
  return vuzp_f32(a, b);
}

// CHECK: @test_vuzp_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
  return vuzp_p8(a, b);
}

// CHECK: @test_vuzp_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
  return vuzp_p16(a, b);
}

// CHECK: @test_vuzpq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
  return vuzpq_s8(a, b);
}

// CHECK: @test_vuzpq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
  return vuzpq_s16(a, b);
}

// CHECK: @test_vuzpq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
  return vuzpq_s32(a, b);
}

// CHECK: @test_vuzpq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
  return vuzpq_u8(a, b);
}

// CHECK: @test_vuzpq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
  return vuzpq_u16(a, b);
}

// CHECK: @test_vuzpq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
  return vuzpq_u32(a, b);
}

// CHECK: @test_vuzpq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x float> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
  return vuzpq_f32(a, b);
}

// CHECK: @test_vuzpq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
  return vuzpq_p8(a, b);
}

// CHECK: @test_vuzpq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
  return vuzpq_p16(a, b);
}
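// vzip is the inverse: it interleaves a and b lane by lane, val[0] zipping the
// low halves of the two inputs and val[1] the high halves.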
19791 // CHECK: @test_vzip_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
19792 // CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
19793 // CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
19794 // CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
19795 // CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
19796 // CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
19797 // CHECK: ret void
19798 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
19799 return vzip_s8(a, b);
19802 // CHECK: @test_vzip_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
19803 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
19804 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
19805 // CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
19806 // CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
19807 // CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
19808 // CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
19809 // CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
19810 // CHECK: ret void
19811 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
19812 return vzip_s16(a, b);
19815 // CHECK: @test_vzip_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
19816 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
19817 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
19818 // CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
19819 // CHECK: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
19820 // CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
19821 // CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
19822 // CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
19823 // CHECK: ret void
19824 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
19825 return vzip_s32(a, b);
19828 // CHECK: @test_vzip_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
19829 // CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
19830 // CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
19831 // CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
19832 // CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
19833 // CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
19834 // CHECK: ret void
19835 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
19836 return vzip_u8(a, b);
19839 // CHECK: @test_vzip_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
19840 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
19841 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
19842 // CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
19843 // CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
19844 // CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
19845 // CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
19846 // CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
19847 // CHECK: ret void
19848 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
19849 return vzip_u16(a, b);
19852 // CHECK: @test_vzip_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
19853 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
19854 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
19855 // CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
19856 // CHECK: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
19857 // CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
19858 // CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
19859 // CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
19860 // CHECK: ret void
19861 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
19862 return vzip_u32(a, b);
// CHECK: @test_vzip_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
// CHECK: store <2 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x float> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
  return vzip_f32(a, b);
}

// CHECK: @test_vzip_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
  return vzip_p8(a, b);
}

// CHECK: @test_vzip_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
  return vzip_p16(a, b);
}

// CHECK: @test_vzipq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
  return vzipq_s8(a, b);
}

// CHECK: @test_vzipq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
  return vzipq_s16(a, b);
}

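// The q-register forms follow the same shape on 128-bit vectors: val[0] zips
// the low halves of the operands and val[1] zips the high halves, matching
// the two shuffle masks verified above. A hypothetical model of vzipq_s16,
// again for illustration only and not part of the test corpus:
static int16x8x2_t vzipq_s16_model(int16x8_t a, int16x8_t b) {
  int16x8x2_t r;
  r.val[0] = __builtin_shufflevector(a, b, 0, 8, 1, 9, 2, 10, 3, 11);   // low halves interleaved
  r.val[1] = __builtin_shufflevector(a, b, 4, 12, 5, 13, 6, 14, 7, 15); // high halves interleaved
  return r;
}
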
// CHECK: @test_vzipq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
  return vzipq_s32(a, b);
}

// CHECK: @test_vzipq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
  return vzipq_u8(a, b);
}

// CHECK: @test_vzipq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
  return vzipq_u16(a, b);
}

// CHECK: @test_vzipq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
  return vzipq_u32(a, b);
}

// CHECK: @test_vzipq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x float> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
  return vzipq_f32(a, b);
}

// CHECK: @test_vzipq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
// CHECK: ret void
poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
  return vzipq_p8(a, b);
}

// CHECK: @test_vzipq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
// CHECK: ret void
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
  return vzipq_p16(a, b);
}
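
// Note: every vzip/vzipq test returns a two-vector aggregate, so the result
// comes back indirectly through the sret pointer captured as AGG_RESULT in
// the patterns above: the first half is stored at offset 0, the second at
// the next vector-sized slot, and the function itself returns void, which is
// why each test verifies two stores followed by a void return.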