clang/test/CodeGen/AArch64/neon-2velem.c

   1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
   2 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
   3
   4 // REQUIRES: aarch64-registered-target || arm-registered-target
   5
   6 #include <arm_neon.h>
   7
   8 // CHECK-LABEL: @test_vmla_lane_s16(
   9 // CHECK-NEXT:  entry:
  10 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
  11 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
  12 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  13 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
  14 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
  15 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
  16 //
  17 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
  18   return vmla_lane_s16(a, b, v, 3);
  19 }
  20
  21 // CHECK-LABEL: @test_vmlaq_lane_s16(
  22 // CHECK-NEXT:  entry:
  23 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
  24 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
  25 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  26 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
  27 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
  28 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
  29 //
  30 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
  31   return vmlaq_lane_s16(a, b, v, 3);
  32 }
  33
  34 // CHECK-LABEL: @test_vmla_lane_s32(
  35 // CHECK-NEXT:  entry:
  36 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
  37 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
  38 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
  39 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
  40 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
  41 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
  42 //
  43 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
  44   return vmla_lane_s32(a, b, v, 1);
  45 }
  46
  47 // CHECK-LABEL: @test_vmlaq_lane_s32(
  48 // CHECK-NEXT:  entry:
  49 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
  50 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
  51 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  52 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
  53 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
  54 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
  55 //
  56 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
  57   return vmlaq_lane_s32(a, b, v, 1);
  58 }
  59
  60 // CHECK-LABEL: @test_vmla_laneq_s16(
  61 // CHECK-NEXT:  entry:
  62 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
  63 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
  64 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  65 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
  66 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
  67 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
  68 //
  69 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
  70   return vmla_laneq_s16(a, b, v, 7);
  71 }
  72
  73 // CHECK-LABEL: @test_vmlaq_laneq_s16(
  74 // CHECK-NEXT:  entry:
  75 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
  76 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
  77 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  78 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
  79 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
  80 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
  81 //
  82 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
  83   return vmlaq_laneq_s16(a, b, v, 7);
  84 }
  85
  86 // CHECK-LABEL: @test_vmla_laneq_s32(
  87 // CHECK-NEXT:  entry:
  88 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
  89 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
  90 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
  91 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
  92 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
  93 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
  94 //
  95 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
  96   return vmla_laneq_s32(a, b, v, 3);
  97 }
  98
  99 // CHECK-LABEL: @test_vmlaq_laneq_s32(
 100 // CHECK-NEXT:  entry:
 101 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 102 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 103 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 104 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 105 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 106 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 107 //
 108 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
 109   return vmlaq_laneq_s32(a, b, v, 3);
 110 }
 111
 112 // CHECK-LABEL: @test_vmls_lane_s16(
 113 // CHECK-NEXT:  entry:
 114 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 115 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 116 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 117 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 118 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 119 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 120 //
 121 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
 122   return vmls_lane_s16(a, b, v, 3);
 123 }
 124
 125 // CHECK-LABEL: @test_vmlsq_lane_s16(
 126 // CHECK-NEXT:  entry:
 127 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 128 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 129 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 130 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 131 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 132 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 133 //
 134 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
 135   return vmlsq_lane_s16(a, b, v, 3);
 136 }
 137
 138 // CHECK-LABEL: @test_vmls_lane_s32(
 139 // CHECK-NEXT:  entry:
 140 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 141 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 142 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 143 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 144 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 145 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 146 //
 147 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
 148   return vmls_lane_s32(a, b, v, 1);
 149 }
 150
 151 // CHECK-LABEL: @test_vmlsq_lane_s32(
 152 // CHECK-NEXT:  entry:
 153 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 154 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 155 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 156 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 157 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 158 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 159 //
 160 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
 161   return vmlsq_lane_s32(a, b, v, 1);
 162 }
 163
 164 // CHECK-LABEL: @test_vmls_laneq_s16(
 165 // CHECK-NEXT:  entry:
 166 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 167 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 168 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 169 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 170 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 171 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
 172 //
 173 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
 174   return vmls_laneq_s16(a, b, v, 7);
 175 }
 176
 177 // CHECK-LABEL: @test_vmlsq_laneq_s16(
 178 // CHECK-NEXT:  entry:
 179 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 180 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 181 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 182 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 183 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 184 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
 185 //
 186 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
 187   return vmlsq_laneq_s16(a, b, v, 7);
 188 }
 189
 190 // CHECK-LABEL: @test_vmls_laneq_s32(
 191 // CHECK-NEXT:  entry:
 192 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 193 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 194 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 195 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 196 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 197 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
 198 //
 199 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
 200   return vmls_laneq_s32(a, b, v, 3);
 201 }
 202
 203 // CHECK-LABEL: @test_vmlsq_laneq_s32(
 204 // CHECK-NEXT:  entry:
 205 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 206 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 207 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 208 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 209 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 210 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 211 //
 212 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
 213   return vmlsq_laneq_s32(a, b, v, 3);
 214 }
 215
 216 // CHECK-LABEL: @test_vmul_lane_s16(
 217 // CHECK-NEXT:  entry:
 218 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 219 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 220 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 221 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 222 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 223 //
 224 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
 225   return vmul_lane_s16(a, v, 3);
 226 }
 227
 228 // CHECK-LABEL: @test_vmulq_lane_s16(
 229 // CHECK-NEXT:  entry:
 230 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 231 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 232 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 233 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 234 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 235 //
 236 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
 237   return vmulq_lane_s16(a, v, 3);
 238 }
 239
 240 // CHECK-LABEL: @test_vmul_lane_s32(
 241 // CHECK-NEXT:  entry:
 242 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 243 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 244 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 245 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 246 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 247 //
 248 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
 249   return vmul_lane_s32(a, v, 1);
 250 }
 251
 252 // CHECK-LABEL: @test_vmulq_lane_s32(
 253 // CHECK-NEXT:  entry:
 254 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 255 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 256 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 257 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 258 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 259 //
 260 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
 261   return vmulq_lane_s32(a, v, 1);
 262 }
 263
 264 // CHECK-LABEL: @test_vmul_lane_u16(
 265 // CHECK-NEXT:  entry:
 266 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 267 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 268 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 269 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 270 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 271 //
 272 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
 273   return vmul_lane_u16(a, v, 3);
 274 }
 275
 276 // CHECK-LABEL: @test_vmulq_lane_u16(
 277 // CHECK-NEXT:  entry:
 278 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 279 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 280 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 281 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 282 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 283 //
 284 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
 285   return vmulq_lane_u16(a, v, 3);
 286 }
 287
 288 // CHECK-LABEL: @test_vmul_lane_u32(
 289 // CHECK-NEXT:  entry:
 290 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 291 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 292 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 293 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 294 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 295 //
 296 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
 297   return vmul_lane_u32(a, v, 1);
 298 }
 299
 300 // CHECK-LABEL: @test_vmulq_lane_u32(
 301 // CHECK-NEXT:  entry:
 302 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 303 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 304 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 305 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 306 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 307 //
 308 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
 309   return vmulq_lane_u32(a, v, 1);
 310 }
 311
 312 // CHECK-LABEL: @test_vmul_laneq_s16(
 313 // CHECK-NEXT:  entry:
 314 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 315 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 316 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 317 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 318 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 319 //
 320 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
 321   return vmul_laneq_s16(a, v, 7);
 322 }
 323
 324 // CHECK-LABEL: @test_vmulq_laneq_s16(
 325 // CHECK-NEXT:  entry:
 326 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 327 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 328 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 329 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 330 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 331 //
 332 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
 333   return vmulq_laneq_s16(a, v, 7);
 334 }
 335
 336 // CHECK-LABEL: @test_vmul_laneq_s32(
 337 // CHECK-NEXT:  entry:
 338 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 339 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 340 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 341 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 342 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 343 //
 344 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
 345   return vmul_laneq_s32(a, v, 3);
 346 }
 347
 348 // CHECK-LABEL: @test_vmulq_laneq_s32(
 349 // CHECK-NEXT:  entry:
 350 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 351 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 352 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 353 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 354 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 355 //
 356 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
 357   return vmulq_laneq_s32(a, v, 3);
 358 }
 359
 360 // CHECK-LABEL: @test_vmul_laneq_u16(
 361 // CHECK-NEXT:  entry:
 362 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 363 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 364 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 365 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 366 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
 367 //
 368 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
 369   return vmul_laneq_u16(a, v, 7);
 370 }
 371
 372 // CHECK-LABEL: @test_vmulq_laneq_u16(
 373 // CHECK-NEXT:  entry:
 374 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 375 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 376 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 377 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 378 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
 379 //
 380 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
 381   return vmulq_laneq_u16(a, v, 7);
 382 }
 383
 384 // CHECK-LABEL: @test_vmul_laneq_u32(
 385 // CHECK-NEXT:  entry:
 386 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 387 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 388 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 389 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 390 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
 391 //
 392 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
 393   return vmul_laneq_u32(a, v, 3);
 394 }
 395
 396 // CHECK-LABEL: @test_vmulq_laneq_u32(
 397 // CHECK-NEXT:  entry:
 398 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 399 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 400 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 401 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 402 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
 403 //
 404 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
 405   return vmulq_laneq_u32(a, v, 3);
 406 }
 407
 408 // CHECK-LABEL: @test_vfma_lane_f32(
 409 // CHECK-NEXT:  entry:
 410 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 411 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
 412 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
 413 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 414 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
 415 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 416 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 417 // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
 418 // CHECK-NEXT:    ret <2 x float> [[FMLA2]]
 419 //
 420 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
 421   return vfma_lane_f32(a, b, v, 1);
 422 }
 423
 424 // CHECK-LABEL: @test_vfmaq_lane_f32(
 425 // CHECK-NEXT:  entry:
 426 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 427 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
 428 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
 429 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 430 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 431 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 432 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 433 // CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
 434 // CHECK-NEXT:    ret <4 x float> [[FMLA2]]
 435 //
 436 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 437   return vfmaq_lane_f32(a, b, v, 1);
 438 }
 439
 440 // CHECK-LABEL: @test_vfma_laneq_f32(
 441 // CHECK-NEXT:  entry:
 442 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 443 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
 444 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
 445 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 446 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 447 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 448 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
 449 // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
 450 // CHECK-NEXT:    ret <2 x float> [[TMP6]]
 451 //
 452 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
 453   return vfma_laneq_f32(a, b, v, 3);
 454 }
 455
 456 // CHECK-LABEL: @test_vfmaq_laneq_f32(
 457 // CHECK-NEXT:  entry:
 458 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 459 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
 460 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
 461 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 462 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 463 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 464 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 465 // CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
 466 // CHECK-NEXT:    ret <4 x float> [[TMP6]]
 467 //
 468 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
 469   return vfmaq_laneq_f32(a, b, v, 3);
 470 }
 471
 472 // CHECK-LABEL: @test_vfms_lane_f32(
 473 // CHECK-NEXT:  entry:
 474 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
 475 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 476 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
 477 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
 478 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 479 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
 480 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 481 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 482 // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
 483 // CHECK-NEXT:    ret <2 x float> [[FMLA2]]
 484 //
 485 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
 486   return vfms_lane_f32(a, b, v, 1);
 487 }
 488
 489 // CHECK-LABEL: @test_vfmsq_lane_f32(
 490 // CHECK-NEXT:  entry:
 491 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
 492 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 493 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
 494 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
 495 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 496 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 497 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 498 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 499 // CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
 500 // CHECK-NEXT:    ret <4 x float> [[FMLA2]]
 501 //
 502 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 503   return vfmsq_lane_f32(a, b, v, 1);
 504 }
 505
 506 // CHECK-LABEL: @test_vfms_laneq_f32(
 507 // CHECK-NEXT:  entry:
 508 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
 509 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
 510 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
 511 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
 512 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 513 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 514 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 515 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
 516 // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
 517 // CHECK-NEXT:    ret <2 x float> [[TMP6]]
 518 //
 519 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
 520   return vfms_laneq_f32(a, b, v, 3);
 521 }
 522
 523 // CHECK-LABEL: @test_vfmsq_laneq_f32(
 524 // CHECK-NEXT:  entry:
 525 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
 526 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 527 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
 528 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
 529 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 530 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 531 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 532 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 533 // CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
 534 // CHECK-NEXT:    ret <4 x float> [[TMP6]]
 535 //
 536 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
 537   return vfmsq_laneq_f32(a, b, v, 3);
 538 }
 539
 540 // CHECK-LABEL: @test_vfmaq_lane_f64(
 541 // CHECK-NEXT:  entry:
 542 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
 543 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
 544 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
 545 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
 546 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
 547 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 548 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 549 // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
 550 // CHECK-NEXT:    ret <2 x double> [[FMLA2]]
 551 //
 552 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
 553   return vfmaq_lane_f64(a, b, v, 0);
 554 }
 555
 556 // CHECK-LABEL: @test_vfmaq_laneq_f64(
 557 // CHECK-NEXT:  entry:
 558 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
 559 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
 560 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
 561 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 562 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 563 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
 564 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
 565 // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
 566 // CHECK-NEXT:    ret <2 x double> [[TMP6]]
 567 //
 568 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
 569   return vfmaq_laneq_f64(a, b, v, 1);
 570 }
 571
 572 // CHECK-LABEL: @test_vfmsq_lane_f64(
 573 // CHECK-NEXT:  entry:
 574 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
 575 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
 576 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
 577 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
 578 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
 579 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
 580 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 581 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 582 // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
 583 // CHECK-NEXT:    ret <2 x double> [[FMLA2]]
 584 //
 585 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
 586   return vfmsq_lane_f64(a, b, v, 0);
 587 }
 588
 589 // CHECK-LABEL: @test_vfmsq_laneq_f64(
 590 // CHECK-NEXT:  entry:
 591 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
 592 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
 593 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
 594 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
 595 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 596 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 597 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
 598 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
 599 // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
 600 // CHECK-NEXT:    ret <2 x double> [[TMP6]]
 601 //
 602 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
 603   return vfmsq_laneq_f64(a, b, v, 1);
 604 }
 605
 606 // CHECK-LABEL: @test_vfmas_laneq_f32(
 607 // CHECK-NEXT:  entry:
 608 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
 609 // CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
 610 // CHECK-NEXT:    ret float [[TMP0]]
 611 //
 612 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
 613   return vfmas_laneq_f32(a, b, v, 3);
 614 }
 615
 616 // CHECK-LABEL: @test_vfmsd_lane_f64(
 617 // CHECK-NEXT:  entry:
 618 // CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
 619 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i32 0
 620 // CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
 621 // CHECK-NEXT:    ret double [[TMP0]]
 622 //
 623 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
 624   return vfmsd_lane_f64(a, b, v, 0);
 625 }
 626
 627 // CHECK-LABEL: @test_vfmss_laneq_f32(
 628 // CHECK-NEXT:  entry:
 629 // CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[B:%.*]]
 630 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
 631 // CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A:%.*]])
 632 // CHECK-NEXT:    ret float [[TMP0]]
 633 //
 634 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
 635   return vfmss_laneq_f32(a, b, v, 3);
 636 }
 637
 638 // CHECK-LABEL: @test_vfmsd_laneq_f64(
 639 // CHECK-NEXT:  entry:
 640 // CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
 641 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i32 1
 642 // CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
 643 // CHECK-NEXT:    ret double [[TMP0]]
 644 //
 645 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
 646   return vfmsd_laneq_f64(a, b, v, 1);
 647 }
 648
 649 // CHECK-LABEL: @test_vmlal_lane_s16(
 650 // CHECK-NEXT:  entry:
 651 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 652 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 653 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 654 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 655 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 656 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
 657 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 658 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 659 //
 660 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 661   return vmlal_lane_s16(a, b, v, 3);
 662 }
 663
 664 // CHECK-LABEL: @test_vmlal_lane_s32(
 665 // CHECK-NEXT:  entry:
 666 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 667 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 668 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 669 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 670 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 671 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
 672 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 673 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 674 //
 675 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 676   return vmlal_lane_s32(a, b, v, 1);
 677 }
 678
 679 // CHECK-LABEL: @test_vmlal_laneq_s16(
 680 // CHECK-NEXT:  entry:
 681 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 682 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 683 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 684 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 685 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 686 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
 687 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 688 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 689 //
 690 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 691   return vmlal_laneq_s16(a, b, v, 7);
 692 }
 693
 694 // CHECK-LABEL: @test_vmlal_laneq_s32(
 695 // CHECK-NEXT:  entry:
 696 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 697 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 698 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 699 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 700 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 701 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
 702 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 703 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 704 //
 705 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 706   return vmlal_laneq_s32(a, b, v, 3);
 707 }
 708
 709 // CHECK-LABEL: @test_vmlal_high_lane_s16(
 710 // CHECK-NEXT:  entry:
 711 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 712 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 713 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 714 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 715 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 716 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 717 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 718 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 719 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 720 //
 721 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 722   return vmlal_high_lane_s16(a, b, v, 3);
 723 }
 724
 725 // CHECK-LABEL: @test_vmlal_high_lane_s32(
 726 // CHECK-NEXT:  entry:
 727 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
 728 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 729 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 730 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 731 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 732 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 733 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 734 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 735 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 736 //
 737 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 738   return vmlal_high_lane_s32(a, b, v, 1);
 739 }
 740
 741 // CHECK-LABEL: @test_vmlal_high_laneq_s16(
 742 // CHECK-NEXT:  entry:
 743 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 744 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 745 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 746 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 747 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 748 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 749 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 750 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 751 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 752 //
 753 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 754   return vmlal_high_laneq_s16(a, b, v, 7);
 755 }
 756
 757 // CHECK-LABEL: @test_vmlal_high_laneq_s32(
 758 // CHECK-NEXT:  entry:
 759 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
 760 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 761 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 762 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 763 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 764 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 765 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 766 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 767 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 768 //
 769 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
 770   return vmlal_high_laneq_s32(a, b, v, 3);
 771 }
 772
 773 // CHECK-LABEL: @test_vmlsl_lane_s16(
 774 // CHECK-NEXT:  entry:
 775 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 776 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 777 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 778 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 779 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 780 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
 781 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 782 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 783 //
 784 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 785   return vmlsl_lane_s16(a, b, v, 3);
 786 }
 787
 788 // CHECK-LABEL: @test_vmlsl_lane_s32(
 789 // CHECK-NEXT:  entry:
 790 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 791 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 792 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 793 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 794 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 795 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
 796 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 797 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 798 //
 799 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 800   return vmlsl_lane_s32(a, b, v, 1);
 801 }
 802
 803 // CHECK-LABEL: @test_vmlsl_laneq_s16(
 804 // CHECK-NEXT:  entry:
 805 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 806 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 807 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 808 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 809 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 810 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
 811 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 812 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 813 //
 814 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 815   return vmlsl_laneq_s16(a, b, v, 7);
 816 }
 817
 818 // CHECK-LABEL: @test_vmlsl_laneq_s32(
 819 // CHECK-NEXT:  entry:
 820 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 821 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 822 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 823 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 824 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 825 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
 826 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 827 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 828 //
 829 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 830   return vmlsl_laneq_s32(a, b, v, 3);
 831 }
 832
 833 // CHECK-LABEL: @test_vmlsl_high_lane_s16(
 834 // CHECK-NEXT:  entry:
 835 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 836 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 837 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 838 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 839 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 840 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 841 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 842 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 843 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 844 //
 845 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 846   return vmlsl_high_lane_s16(a, b, v, 3);
 847 }
 848
 849 // CHECK-LABEL: @test_vmlsl_high_lane_s32(
 850 // CHECK-NEXT:  entry:
 851 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
 852 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 853 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 854 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 855 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 856 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 857 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 858 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 859 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 860 //
 861 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 862   return vmlsl_high_lane_s32(a, b, v, 1);
 863 }
 864
 865 // CHECK-LABEL: @test_vmlsl_high_laneq_s16(
 866 // CHECK-NEXT:  entry:
 867 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 868 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 869 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 870 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 871 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 872 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 873 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 874 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 875 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
 876 //
 877 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
 878   return vmlsl_high_laneq_s16(a, b, v, 7);
 879 }
 880
 881 // CHECK-LABEL: @test_vmlsl_high_laneq_s32(
 882 // CHECK-NEXT:  entry:
 883 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
 884 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 885 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 886 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 887 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 888 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 889 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 890 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 891 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
 892 //
 893 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
 894   return vmlsl_high_laneq_s32(a, b, v, 3);
 895 }
 896
 897 // CHECK-LABEL: @test_vmlal_lane_u16(
 898 // CHECK-NEXT:  entry:
 899 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 900 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 901 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 902 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 903 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 904 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
 905 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 906 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 907 //
 908 int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
 909   return vmlal_lane_u16(a, b, v, 3);
 910 }
 911
 912 // CHECK-LABEL: @test_vmlal_lane_u32(
 913 // CHECK-NEXT:  entry:
 914 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 915 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 916 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 917 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 918 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 919 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
 920 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 921 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 922 //
 923 int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
 924   return vmlal_lane_u32(a, b, v, 1);
 925 }
 926
 927 // CHECK-LABEL: @test_vmlal_laneq_u16(
 928 // CHECK-NEXT:  entry:
 929 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 930 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 931 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 932 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 933 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 934 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
 935 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 936 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 937 //
 938 int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
 939   return vmlal_laneq_u16(a, b, v, 7);
 940 }
 941
 942 // CHECK-LABEL: @test_vmlal_laneq_u32(
 943 // CHECK-NEXT:  entry:
 944 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 945 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 946 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
 947 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 948 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 949 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
 950 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 951 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 952 //
 953 int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
 954   return vmlal_laneq_u32(a, b, v, 3);
 955 }
 956
 957 // CHECK-LABEL: @test_vmlal_high_lane_u16(
 958 // CHECK-NEXT:  entry:
 959 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 960 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
 961 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
 962 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 963 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 964 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 965 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 966 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 967 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
 968 //
 969 int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
 970   return vmlal_high_lane_u16(a, b, v, 3);
 971 }
 972
 973 // CHECK-LABEL: @test_vmlal_high_lane_u32(
 974 // CHECK-NEXT:  entry:
 975 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
 976 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
 977 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
 978 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
 979 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 980 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
 981 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 982 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 983 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
 984 //
 985 int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
 986   return vmlal_high_lane_u32(a, b, v, 1);
 987 }
 988
 989 // CHECK-LABEL: @test_vmlal_high_laneq_u16(
 990 // CHECK-NEXT:  entry:
 991 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 992 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 993 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 994 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 995 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 996 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
 997 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 998 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 999 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
1000 //
1001 int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
1002   return vmlal_high_laneq_u16(a, b, v, 7);
1003 }
1004
1005 // CHECK-LABEL: @test_vmlal_high_laneq_u32(
1006 // CHECK-NEXT:  entry:
1007 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1008 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1009 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1010 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1011 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1012 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1013 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1014 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
1015 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
1016 //
1017 int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
1018   return vmlal_high_laneq_u32(a, b, v, 3);
1019 }
1020
1021 // CHECK-LABEL: @test_vmlsl_lane_u16(
1022 // CHECK-NEXT:  entry:
1023 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1024 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1025 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1026 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1027 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1028 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1029 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1030 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
1031 //
1032 int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
1033   return vmlsl_lane_u16(a, b, v, 3);
1034 }
1035
1036 // CHECK-LABEL: @test_vmlsl_lane_u32(
1037 // CHECK-NEXT:  entry:
1038 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1039 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1040 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1041 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1042 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1043 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1044 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1045 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
1046 //
1047 int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
1048   return vmlsl_lane_u32(a, b, v, 1);
1049 }
1050
1051 // CHECK-LABEL: @test_vmlsl_laneq_u16(
1052 // CHECK-NEXT:  entry:
1053 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1054 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1055 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1056 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1057 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1058 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1059 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1060 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
1061 //
1062 int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
1063   return vmlsl_laneq_u16(a, b, v, 7);
1064 }
1065
1066 // CHECK-LABEL: @test_vmlsl_laneq_u32(
1067 // CHECK-NEXT:  entry:
1068 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1069 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1070 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1071 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1072 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1073 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1074 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1075 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
1076 //
1077 int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
1078   return vmlsl_laneq_u32(a, b, v, 3);
1079 }
1080
1081 // CHECK-LABEL: @test_vmlsl_high_lane_u16(
1082 // CHECK-NEXT:  entry:
1083 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1084 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1085 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1086 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1087 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1088 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1089 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1090 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1091 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
1092 //
1093 int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
1094   return vmlsl_high_lane_u16(a, b, v, 3);
1095 }
1096
1097 // CHECK-LABEL: @test_vmlsl_high_lane_u32(
1098 // CHECK-NEXT:  entry:
1099 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1100 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1101 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1102 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1103 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1104 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1105 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1106 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1107 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
1108 //
1109 int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
1110   return vmlsl_high_lane_u32(a, b, v, 1);
1111 }
1112
1113 // CHECK-LABEL: @test_vmlsl_high_laneq_u16(
1114 // CHECK-NEXT:  entry:
1115 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1116 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1117 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1118 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1119 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1120 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1121 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1122 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1123 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
1124 //
1125 int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
1126   return vmlsl_high_laneq_u16(a, b, v, 7);
1127 }
1128
1129 // CHECK-LABEL: @test_vmlsl_high_laneq_u32(
1130 // CHECK-NEXT:  entry:
1131 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1132 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1133 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1134 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1135 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1136 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1137 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1138 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1139 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
1140 //
1141 int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
1142   return vmlsl_high_laneq_u32(a, b, v, 3);
1143 }
1144
1145 // CHECK-LABEL: @test_vmull_lane_s16(
1146 // CHECK-NEXT:  entry:
1147 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1148 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1149 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1150 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1151 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1152 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1153 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1154 //
1155 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
1156   return vmull_lane_s16(a, v, 3);
1157 }
1158
1159 // CHECK-LABEL: @test_vmull_lane_s32(
1160 // CHECK-NEXT:  entry:
1161 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1162 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1163 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1164 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1165 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1166 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1167 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1168 //
1169 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
1170   return vmull_lane_s32(a, v, 1);
1171 }
1172
1173 // CHECK-LABEL: @test_vmull_lane_u16(
1174 // CHECK-NEXT:  entry:
1175 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1176 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1177 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1178 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1179 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1180 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1181 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1182 //
1183 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
1184   return vmull_lane_u16(a, v, 3);
1185 }
1186
1187 // CHECK-LABEL: @test_vmull_lane_u32(
1188 // CHECK-NEXT:  entry:
1189 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1190 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1191 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1192 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1193 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1194 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1195 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1196 //
1197 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
1198   return vmull_lane_u32(a, v, 1);
1199 }
1200
1201 // CHECK-LABEL: @test_vmull_high_lane_s16(
1202 // CHECK-NEXT:  entry:
1203 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1204 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1205 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1206 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1207 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1208 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1209 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1210 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1211 //
1212 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1213   return vmull_high_lane_s16(a, v, 3);
1214 }
1215
1216 // CHECK-LABEL: @test_vmull_high_lane_s32(
1217 // CHECK-NEXT:  entry:
1218 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1219 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1220 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1221 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1222 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1223 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1224 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1225 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1226 //
1227 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
1228   return vmull_high_lane_s32(a, v, 1);
1229 }
1230
1231 // CHECK-LABEL: @test_vmull_high_lane_u16(
1232 // CHECK-NEXT:  entry:
1233 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1234 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1235 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1236 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1237 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1238 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1239 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1240 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1241 //
1242 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
1243   return vmull_high_lane_u16(a, v, 3);
1244 }
1245
1246 // CHECK-LABEL: @test_vmull_high_lane_u32(
1247 // CHECK-NEXT:  entry:
1248 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1249 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1250 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1251 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1252 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1253 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1254 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1255 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1256 //
1257 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
1258   return vmull_high_lane_u32(a, v, 1);
1259 }
1260
1261 // CHECK-LABEL: @test_vmull_laneq_s16(
1262 // CHECK-NEXT:  entry:
1263 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1264 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1265 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1266 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1267 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1268 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1269 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1270 //
1271 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
1272   return vmull_laneq_s16(a, v, 7);
1273 }
1274
1275 // CHECK-LABEL: @test_vmull_laneq_s32(
1276 // CHECK-NEXT:  entry:
1277 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1278 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1279 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1280 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1281 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1282 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1283 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1284 //
1285 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
1286   return vmull_laneq_s32(a, v, 3);
1287 }
1288
1289 // CHECK-LABEL: @test_vmull_laneq_u16(
1290 // CHECK-NEXT:  entry:
1291 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1292 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1293 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1294 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1295 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1296 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1297 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1298 //
1299 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
1300   return vmull_laneq_u16(a, v, 7);
1301 }
1302
1303 // CHECK-LABEL: @test_vmull_laneq_u32(
1304 // CHECK-NEXT:  entry:
1305 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1306 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1307 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1308 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1309 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1310 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1311 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1312 //
1313 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
1314   return vmull_laneq_u32(a, v, 3);
1315 }
1316
1317 // CHECK-LABEL: @test_vmull_high_laneq_s16(
1318 // CHECK-NEXT:  entry:
1319 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1320 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1321 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1322 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1323 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1324 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1325 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1326 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1327 //
1328 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1329   return vmull_high_laneq_s16(a, v, 7);
1330 }
1331
1332 // CHECK-LABEL: @test_vmull_high_laneq_s32(
1333 // CHECK-NEXT:  entry:
1334 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1335 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1336 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1337 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1338 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1339 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1340 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1341 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1342 //
1343 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1344   return vmull_high_laneq_s32(a, v, 3);
1345 }
1346
1347 // CHECK-LABEL: @test_vmull_high_laneq_u16(
1348 // CHECK-NEXT:  entry:
1349 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1350 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1351 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1352 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1353 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1354 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1355 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1356 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
1357 //
1358 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
1359   return vmull_high_laneq_u16(a, v, 7);
1360 }
1361
1362 // CHECK-LABEL: @test_vmull_high_laneq_u32(
1363 // CHECK-NEXT:  entry:
1364 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1365 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1366 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1367 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1368 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1369 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1370 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1371 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
1372 //
1373 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
1374   return vmull_high_laneq_u32(a, v, 3);
1375 }
1376
1377 // CHECK-LABEL: @test_vqdmlal_lane_s16(
1378 // CHECK-NEXT:  entry:
1379 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1380 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1381 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1382 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1383 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1384 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1385 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1386 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1387 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
1388 //
1389 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1390   return vqdmlal_lane_s16(a, b, v, 3);
1391 }
1392
1393 // CHECK-LABEL: @test_vqdmlal_lane_s32(
1394 // CHECK-NEXT:  entry:
1395 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1396 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1397 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1398 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1399 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1400 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1401 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1402 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1403 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
1404 //
1405 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1406   return vqdmlal_lane_s32(a, b, v, 1);
1407 }
1408
1409 // CHECK-LABEL: @test_vqdmlal_high_lane_s16(
1410 // CHECK-NEXT:  entry:
1411 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1412 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1413 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1414 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1415 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1416 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1417 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1418 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1419 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1420 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
1421 //
1422 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1423   return vqdmlal_high_lane_s16(a, b, v, 3);
1424 }
1425
1426 // CHECK-LABEL: @test_vqdmlal_high_lane_s32(
1427 // CHECK-NEXT:  entry:
1428 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1429 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1430 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1431 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1432 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1433 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1434 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1435 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1436 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1437 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
1438 //
1439 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1440   return vqdmlal_high_lane_s32(a, b, v, 1);
1441 }
1442
1443 // CHECK-LABEL: @test_vqdmlsl_lane_s16(
1444 // CHECK-NEXT:  entry:
1445 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1446 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1447 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1448 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1449 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1450 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1451 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1452 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1453 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
1454 //
1455 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1456   return vqdmlsl_lane_s16(a, b, v, 3);
1457 }
1458
1459 // CHECK-LABEL: @test_vqdmlsl_lane_s32(
1460 // CHECK-NEXT:  entry:
1461 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1462 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1463 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1464 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1465 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1466 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1467 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1468 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1469 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
1470 //
1471 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1472   return vqdmlsl_lane_s32(a, b, v, 1);
1473 }
1474
1475 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
1476 // CHECK-NEXT:  entry:
1477 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1478 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1479 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1480 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1481 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1482 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1483 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1484 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1485 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1486 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
1487 //
1488 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1489   return vqdmlsl_high_lane_s16(a, b, v, 3);
1490 }
1491
1492 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
1493 // CHECK-NEXT:  entry:
1494 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1495 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1496 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1497 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1498 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1499 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1500 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1501 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1502 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1503 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
1504 //
1505 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1506   return vqdmlsl_high_lane_s32(a, b, v, 1);
1507 }
1508
1509 // CHECK-LABEL: @test_vqdmull_lane_s16(
1510 // CHECK-NEXT:  entry:
1511 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1512 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1513 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1514 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1515 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1516 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1517 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1518 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
1519 //
1520 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
1521   return vqdmull_lane_s16(a, v, 3);
1522 }
1523
1524 // CHECK-LABEL: @test_vqdmull_lane_s32(
1525 // CHECK-NEXT:  entry:
1526 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1527 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1528 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1529 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1530 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1531 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1532 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1533 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
1534 //
1535 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
1536   return vqdmull_lane_s32(a, v, 1);
1537 }
1538
1539 // CHECK-LABEL: @test_vqdmull_laneq_s16(
1540 // CHECK-NEXT:  entry:
1541 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1542 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1543 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1544 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1545 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1546 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1547 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1548 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
1549 //
1550 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
1551   return vqdmull_laneq_s16(a, v, 3);
1552 }
1553
1554 // CHECK-LABEL: @test_vqdmull_laneq_s32(
1555 // CHECK-NEXT:  entry:
1556 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1557 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1558 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1559 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1560 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1561 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1562 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1563 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
1564 //
1565 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
1566   return vqdmull_laneq_s32(a, v, 3);
1567 }
1568
1569 // CHECK-LABEL: @test_vqdmull_high_lane_s16(
1570 // CHECK-NEXT:  entry:
1571 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1572 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1573 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1574 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1575 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1576 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1577 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1578 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1579 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
1580 //
1581 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1582   return vqdmull_high_lane_s16(a, v, 3);
1583 }
1584
1585 // CHECK-LABEL: @test_vqdmull_high_lane_s32(
1586 // CHECK-NEXT:  entry:
1587 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1588 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1589 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1590 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1591 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1592 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1593 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1594 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1595 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
1596 //
1597 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
1598   return vqdmull_high_lane_s32(a, v, 1);
1599 }
1600
1601 // CHECK-LABEL: @test_vqdmull_high_laneq_s16(
1602 // CHECK-NEXT:  entry:
1603 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1604 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1605 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1606 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1607 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1608 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1609 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1610 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1611 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
1612 //
1613 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1614   return vqdmull_high_laneq_s16(a, v, 7);
1615 }
1616
1617 // CHECK-LABEL: @test_vqdmull_high_laneq_s32(
1618 // CHECK-NEXT:  entry:
1619 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1620 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1621 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1622 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1623 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1624 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1625 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1626 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1627 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
1628 //
1629 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1630   return vqdmull_high_laneq_s32(a, v, 3);
1631 }
1632
1633 // CHECK-LABEL: @test_vqdmulh_lane_s16(
1634 // CHECK-NEXT:  entry:
1635 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1636 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1637 // CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1638 // CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1639 // CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
1640 // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANE_V2]]
1641 //
1642 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1643   return vqdmulh_lane_s16(a, v, 3);
1644 }
1645
1646 // CHECK-LABEL: @test_vqdmulhq_lane_s16(
1647 // CHECK-NEXT:  entry:
1648 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
1649 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1650 // CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1651 // CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1652 // CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
1653 // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANE_V2]]
1654 //
1655 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1656   return vqdmulhq_lane_s16(a, v, 3);
1657 }
1658
1659 // CHECK-LABEL: @test_vqdmulh_lane_s32(
1660 // CHECK-NEXT:  entry:
1661 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1662 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1663 // CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1664 // CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1665 // CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
1666 // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANE_V2]]
1667 //
1668 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1669   return vqdmulh_lane_s32(a, v, 1);
1670 }
1671
1672 // CHECK-LABEL: @test_vqdmulhq_lane_s32(
1673 // CHECK-NEXT:  entry:
1674 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1675 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1676 // CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1677 // CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1678 // CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
1679 // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANE_V2]]
1680 //
1681 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1682   return vqdmulhq_lane_s32(a, v, 1);
1683 }
1684
1685 // CHECK-LABEL: @test_vqrdmulh_lane_s16(
1686 // CHECK-NEXT:  entry:
1687 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1688 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1689 // CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1690 // CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1691 // CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
1692 // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANE_V2]]
1693 //
1694 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1695   return vqrdmulh_lane_s16(a, v, 3);
1696 }
1697
1698 // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
1699 // CHECK-NEXT:  entry:
1700 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
1701 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1702 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1703 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1704 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
1705 // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
1706 //
1707 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1708   return vqrdmulhq_lane_s16(a, v, 3);
1709 }
1710
1711 // CHECK-LABEL: @test_vqrdmulh_lane_s32(
1712 // CHECK-NEXT:  entry:
1713 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1714 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1715 // CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1716 // CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1717 // CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
1718 // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANE_V2]]
1719 //
1720 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1721   return vqrdmulh_lane_s32(a, v, 1);
1722 }
1723
1724 // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
1725 // CHECK-NEXT:  entry:
1726 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1727 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1728 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1729 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1730 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
1731 // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
1732 //
1733 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1734   return vqrdmulhq_lane_s32(a, v, 1);
1735 }
1736
1737 // CHECK-LABEL: @test_vmul_lane_f32(
1738 // CHECK-NEXT:  entry:
1739 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1740 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1741 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
1742 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
1743 // CHECK-NEXT:    ret <2 x float> [[MUL]]
1744 //
1745 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
1746   return vmul_lane_f32(a, v, 1);
1747 }
1748
1749
1750 // CHECK-LABEL: @test_vmul_lane_f64(
1751 // CHECK-NEXT:  entry:
1752 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
1753 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
1754 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1755 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
1756 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
1757 // CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1758 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1759 // CHECK-NEXT:    ret <1 x double> [[TMP5]]
1760 //
1761 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
1762   return vmul_lane_f64(a, v, 0);
1763 }
1764
1765 // CHECK-LABEL: @test_vmulq_lane_f32(
1766 // CHECK-NEXT:  entry:
1767 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1768 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1769 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1770 // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
1771 // CHECK-NEXT:    ret <4 x float> [[MUL]]
1772 //
1773 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
1774   return vmulq_lane_f32(a, v, 1);
1775 }
1776
1777 // CHECK-LABEL: @test_vmulq_lane_f64(
1778 // CHECK-NEXT:  entry:
1779 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
1780 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
1781 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
1782 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
1783 // CHECK-NEXT:    ret <2 x double> [[MUL]]
1784 //
1785 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
1786   return vmulq_lane_f64(a, v, 0);
1787 }
1788
1789 // CHECK-LABEL: @test_vmul_laneq_f32(
1790 // CHECK-NEXT:  entry:
1791 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1792 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1793 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
1794 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
1795 // CHECK-NEXT:    ret <2 x float> [[MUL]]
1796 //
1797 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
1798   return vmul_laneq_f32(a, v, 3);
1799 }
1800
1801 // CHECK-LABEL: @test_vmul_laneq_f64(
1802 // CHECK-NEXT:  entry:
1803 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
1804 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
1805 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1806 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1807 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
1808 // CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1809 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1810 // CHECK-NEXT:    ret <1 x double> [[TMP5]]
1811 //
1812 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
1813   return vmul_laneq_f64(a, v, 1);
1814 }
1815
1816 // CHECK-LABEL: @test_vmulq_laneq_f32(
1817 // CHECK-NEXT:  entry:
1818 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1819 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1820 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1821 // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
1822 // CHECK-NEXT:    ret <4 x float> [[MUL]]
1823 //
1824 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
1825   return vmulq_laneq_f32(a, v, 3);
1826 }
1827
1828 // CHECK-LABEL: @test_vmulq_laneq_f64(
1829 // CHECK-NEXT:  entry:
1830 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
1831 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1832 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
1833 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
1834 // CHECK-NEXT:    ret <2 x double> [[MUL]]
1835 //
1836 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
1837   return vmulq_laneq_f64(a, v, 1);
1838 }
1839
1840 // CHECK-LABEL: @test_vmulx_lane_f32(
1841 // CHECK-NEXT:  entry:
1842 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1843 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1844 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
1845 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
1846 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
1847 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
1848 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
1849 //
1850 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
1851   return vmulx_lane_f32(a, v, 1);
1852 }
1853
1854 // CHECK-LABEL: @test_vmulxq_lane_f32(
1855 // CHECK-NEXT:  entry:
1856 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1857 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1858 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1859 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
1860 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
1861 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
1862 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
1863 //
1864 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
1865   return vmulxq_lane_f32(a, v, 1);
1866 }
1867
1868 // CHECK-LABEL: @test_vmulxq_lane_f64(
1869 // CHECK-NEXT:  entry:
1870 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
1871 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
1872 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
1873 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
1874 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
1875 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
1876 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
1877 //
1878 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
1879   return vmulxq_lane_f64(a, v, 0);
1880 }
1881
1882 // CHECK-LABEL: @test_vmulx_laneq_f32(
1883 // CHECK-NEXT:  entry:
1884 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1885 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1886 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
1887 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
1888 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
1889 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
1890 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
1891 //
1892 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
1893   return vmulx_laneq_f32(a, v, 3);
1894 }
1895
1896 // CHECK-LABEL: @test_vmulxq_laneq_f32(
1897 // CHECK-NEXT:  entry:
1898 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1899 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1900 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1901 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
1902 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
1903 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
1904 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
1905 //
1906 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
1907   return vmulxq_laneq_f32(a, v, 3);
1908 }
1909
1910 // CHECK-LABEL: @test_vmulxq_laneq_f64(
1911 // CHECK-NEXT:  entry:
1912 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
1913 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1914 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
1915 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
1916 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
1917 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
1918 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
1919 //
1920 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
1921   return vmulxq_laneq_f64(a, v, 1);
1922 }
1923
1924 // CHECK-LABEL: @test_vmla_lane_s16_0(
1925 // CHECK-NEXT:  entry:
1926 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1927 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1928 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
1929 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
1930 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
1931 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
1932 //
1933 int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1934   return vmla_lane_s16(a, b, v, 0);
1935 }
1936
1937 // CHECK-LABEL: @test_vmlaq_lane_s16_0(
1938 // CHECK-NEXT:  entry:
1939 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1940 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1941 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
1942 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
1943 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
1944 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
1945 //
1946 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1947   return vmlaq_lane_s16(a, b, v, 0);
1948 }
1949
1950 // CHECK-LABEL: @test_vmla_lane_s32_0(
1951 // CHECK-NEXT:  entry:
1952 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1953 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1954 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
1955 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
1956 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
1957 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
1958 //
1959 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1960   return vmla_lane_s32(a, b, v, 0);
1961 }
1962
1963 // CHECK-LABEL: @test_vmlaq_lane_s32_0(
1964 // CHECK-NEXT:  entry:
1965 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1966 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1967 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
1968 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
1969 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
1970 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
1971 //
1972 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1973   return vmlaq_lane_s32(a, b, v, 0);
1974 }
1975
1976 // CHECK-LABEL: @test_vmla_laneq_s16_0(
1977 // CHECK-NEXT:  entry:
1978 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1979 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1980 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
1981 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
1982 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
1983 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
1984 //
1985 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1986   return vmla_laneq_s16(a, b, v, 0);
1987 }
1988
1989 // CHECK-LABEL: @test_vmlaq_laneq_s16_0(
1990 // CHECK-NEXT:  entry:
1991 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1992 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1993 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
1994 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
1995 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
1996 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
1997 //
1998 int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1999   return vmlaq_laneq_s16(a, b, v, 0);
2000 }
2001
2002 // CHECK-LABEL: @test_vmla_laneq_s32_0(
2003 // CHECK-NEXT:  entry:
2004 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2005 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2006 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2007 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
2008 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
2009 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
2010 //
2011 int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
2012   return vmla_laneq_s32(a, b, v, 0);
2013 }
2014
2015 // CHECK-LABEL: @test_vmlaq_laneq_s32_0(
2016 // CHECK-NEXT:  entry:
2017 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2018 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2019 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2020 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
2021 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
2022 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2023 //
2024 int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
2025   return vmlaq_laneq_s32(a, b, v, 0);
2026 }
2027
2028 // CHECK-LABEL: @test_vmls_lane_s16_0(
2029 // CHECK-NEXT:  entry:
2030 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2031 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2032 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2033 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
2034 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
2035 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
2036 //
2037 int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
2038   return vmls_lane_s16(a, b, v, 0);
2039 }
2040
2041 // CHECK-LABEL: @test_vmlsq_lane_s16_0(
2042 // CHECK-NEXT:  entry:
2043 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2044 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2045 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
2046 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
2047 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
2048 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
2049 //
2050 int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
2051   return vmlsq_lane_s16(a, b, v, 0);
2052 }
2053
2054 // CHECK-LABEL: @test_vmls_lane_s32_0(
2055 // CHECK-NEXT:  entry:
2056 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2057 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2058 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2059 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
2060 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
2061 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
2062 //
2063 int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
2064   return vmls_lane_s32(a, b, v, 0);
2065 }
2066
2067 // CHECK-LABEL: @test_vmlsq_lane_s32_0(
2068 // CHECK-NEXT:  entry:
2069 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2070 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2071 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
2072 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
2073 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
2074 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2075 //
2076 int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
2077   return vmlsq_lane_s32(a, b, v, 0);
2078 }
2079
2080 // CHECK-LABEL: @test_vmls_laneq_s16_0(
2081 // CHECK-NEXT:  entry:
2082 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2083 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2084 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2085 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
2086 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
2087 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
2088 //
2089 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
2090   return vmls_laneq_s16(a, b, v, 0);
2091 }
2092
2093 // CHECK-LABEL: @test_vmlsq_laneq_s16_0(
2094 // CHECK-NEXT:  entry:
2095 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2096 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2097 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
2098 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
2099 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
2100 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
2101 //
2102 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
2103   return vmlsq_laneq_s16(a, b, v, 0);
2104 }
2105
2106 // CHECK-LABEL: @test_vmls_laneq_s32_0(
2107 // CHECK-NEXT:  entry:
2108 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2109 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2110 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2111 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
2112 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
2113 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
2114 //
2115 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
2116   return vmls_laneq_s32(a, b, v, 0);
2117 }
2118
2119 // CHECK-LABEL: @test_vmlsq_laneq_s32_0(
2120 // CHECK-NEXT:  entry:
2121 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2122 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2123 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2124 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
2125 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
2126 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2127 //
2128 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
2129   return vmlsq_laneq_s32(a, b, v, 0);
2130 }
2131
2132 // CHECK-LABEL: @test_vmul_lane_s16_0(
2133 // CHECK-NEXT:  entry:
2134 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2135 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2136 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2137 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2138 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
2139 //
2140 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
2141   return vmul_lane_s16(a, v, 0);
2142 }
2143
2144 // CHECK-LABEL: @test_vmulq_lane_s16_0(
2145 // CHECK-NEXT:  entry:
2146 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2147 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2148 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
2149 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2150 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
2151 //
2152 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
2153   return vmulq_lane_s16(a, v, 0);
2154 }
2155
2156 // CHECK-LABEL: @test_vmul_lane_s32_0(
2157 // CHECK-NEXT:  entry:
2158 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2159 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2160 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2161 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2162 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
2163 //
2164 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
2165   return vmul_lane_s32(a, v, 0);
2166 }
2167
2168 // CHECK-LABEL: @test_vmulq_lane_s32_0(
2169 // CHECK-NEXT:  entry:
2170 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2171 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2172 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
2173 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2174 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
2175 //
2176 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
2177   return vmulq_lane_s32(a, v, 0);
2178 }
2179
2180 // CHECK-LABEL: @test_vmul_lane_u16_0(
2181 // CHECK-NEXT:  entry:
2182 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2183 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2184 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2185 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2186 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
2187 //
2188 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
2189   return vmul_lane_u16(a, v, 0);
2190 }
2191
2192 // CHECK-LABEL: @test_vmulq_lane_u16_0(
2193 // CHECK-NEXT:  entry:
2194 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2195 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2196 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
2197 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2198 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
2199 //
2200 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
2201   return vmulq_lane_u16(a, v, 0);
2202 }
2203
2204 // CHECK-LABEL: @test_vmul_lane_u32_0(
2205 // CHECK-NEXT:  entry:
2206 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2207 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2208 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2209 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2210 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
2211 //
2212 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
2213   return vmul_lane_u32(a, v, 0);
2214 }
2215
2216 // CHECK-LABEL: @test_vmulq_lane_u32_0(
2217 // CHECK-NEXT:  entry:
2218 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2219 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2220 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
2221 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2222 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
2223 //
2224 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
2225   return vmulq_lane_u32(a, v, 0);
2226 }
2227
2228 // CHECK-LABEL: @test_vmul_laneq_s16_0(
2229 // CHECK-NEXT:  entry:
2230 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2231 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2232 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2233 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2234 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
2235 //
2236 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
2237   return vmul_laneq_s16(a, v, 0);
2238 }
2239
2240 // CHECK-LABEL: @test_vmulq_laneq_s16_0(
2241 // CHECK-NEXT:  entry:
2242 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2243 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2244 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
2245 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2246 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
2247 //
2248 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
2249   return vmulq_laneq_s16(a, v, 0);
2250 }
2251
2252 // CHECK-LABEL: @test_vmul_laneq_s32_0(
2253 // CHECK-NEXT:  entry:
2254 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2255 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2256 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2257 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2258 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
2259 //
2260 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
2261   return vmul_laneq_s32(a, v, 0);
2262 }
2263
2264 // CHECK-LABEL: @test_vmulq_laneq_s32_0(
2265 // CHECK-NEXT:  entry:
2266 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2267 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2268 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2269 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2270 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
2271 //
2272 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
2273   return vmulq_laneq_s32(a, v, 0);
2274 }
2275
2276 // CHECK-LABEL: @test_vmul_laneq_u16_0(
2277 // CHECK-NEXT:  entry:
2278 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2279 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2280 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2281 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2282 // CHECK-NEXT:    ret <4 x i16> [[MUL]]
2283 //
2284 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
2285   return vmul_laneq_u16(a, v, 0);
2286 }
2287
2288 // CHECK-LABEL: @test_vmulq_laneq_u16_0(
2289 // CHECK-NEXT:  entry:
2290 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2291 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2292 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
2293 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2294 // CHECK-NEXT:    ret <8 x i16> [[MUL]]
2295 //
2296 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
2297   return vmulq_laneq_u16(a, v, 0);
2298 }
2299
2300 // CHECK-LABEL: @test_vmul_laneq_u32_0(
2301 // CHECK-NEXT:  entry:
2302 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2303 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2304 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2305 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2306 // CHECK-NEXT:    ret <2 x i32> [[MUL]]
2307 //
2308 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
2309   return vmul_laneq_u32(a, v, 0);
2310 }
2311
2312 // CHECK-LABEL: @test_vmulq_laneq_u32_0(
2313 // CHECK-NEXT:  entry:
2314 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2315 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2316 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2317 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2318 // CHECK-NEXT:    ret <4 x i32> [[MUL]]
2319 //
2320 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
2321   return vmulq_laneq_u32(a, v, 0);
2322 }
2323
2324 // CHECK-LABEL: @test_vfma_lane_f32_0(
2325 // CHECK-NEXT:  entry:
2326 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2327 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
2328 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2329 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2330 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
2331 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2332 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2333 // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
2334 // CHECK-NEXT:    ret <2 x float> [[FMLA2]]
2335 //
2336 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
2337   return vfma_lane_f32(a, b, v, 0);
2338 }
2339
2340 // CHECK-LABEL: @test_vfmaq_lane_f32_0(
2341 // CHECK-NEXT:  entry:
2342 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2343 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
2344 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2345 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2346 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
2347 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2348 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2349 // CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
2350 // CHECK-NEXT:    ret <4 x float> [[FMLA2]]
2351 //
2352 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
2353   return vfmaq_lane_f32(a, b, v, 0);
2354 }
2355
2356 // CHECK-LABEL: @test_vfma_laneq_f32_0(
2357 // CHECK-NEXT:  entry:
2358 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2359 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
2360 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2361 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2362 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2363 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2364 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
2365 // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
2366 // CHECK-NEXT:    ret <2 x float> [[TMP6]]
2367 //
2368 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
2369   return vfma_laneq_f32(a, b, v, 0);
2370 }
2371
2372 // CHECK-LABEL: @test_vfmaq_laneq_f32_0(
2373 // CHECK-NEXT:  entry:
2374 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2375 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
2376 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2377 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2378 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2379 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2380 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
2381 // CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
2382 // CHECK-NEXT:    ret <4 x float> [[TMP6]]
2383 //
2384 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
2385   return vfmaq_laneq_f32(a, b, v, 0);
2386 }
2387
2388 // CHECK-LABEL: @test_vfms_lane_f32_0(
2389 // CHECK-NEXT:  entry:
2390 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
2391 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2392 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
2393 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2394 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2395 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
2396 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2397 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2398 // CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
2399 // CHECK-NEXT:    ret <2 x float> [[FMLA2]]
2400 //
2401 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
2402   return vfms_lane_f32(a, b, v, 0);
2403 }
2404
2405 // CHECK-LABEL: @test_vfmsq_lane_f32_0(
2406 // CHECK-NEXT:  entry:
2407 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
2408 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2409 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
2410 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2411 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2412 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
2413 // CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2414 // CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2415 // CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
2416 // CHECK-NEXT:    ret <4 x float> [[FMLA2]]
2417 //
2418 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
2419   return vfmsq_lane_f32(a, b, v, 0);
2420 }
2421
2422 // CHECK-LABEL: @test_vfms_laneq_f32_0(
2423 // CHECK-NEXT:  entry:
2424 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
2425 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2426 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
2427 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2428 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2429 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2430 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2431 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
2432 // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
2433 // CHECK-NEXT:    ret <2 x float> [[TMP6]]
2434 //
2435 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
2436   return vfms_laneq_f32(a, b, v, 0);
2437 }
2438
2439 // CHECK-LABEL: @test_vfmsq_laneq_f32_0(
2440 // CHECK-NEXT:  entry:
2441 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
2442 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2443 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
2444 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2445 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2446 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2447 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2448 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
2449 // CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
2450 // CHECK-NEXT:    ret <4 x float> [[TMP6]]
2451 //
2452 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
2453   return vfmsq_laneq_f32(a, b, v, 0);
2454 }
2455
2456 // CHECK-LABEL: @test_vfmaq_laneq_f64_0(
2457 // CHECK-NEXT:  entry:
2458 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
2459 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
2460 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
2461 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
2462 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2463 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
2464 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
2465 // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
2466 // CHECK-NEXT:    ret <2 x double> [[TMP6]]
2467 //
2468 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
2469   return vfmaq_laneq_f64(a, b, v, 0);
2470 }
2471
2472 // CHECK-LABEL: @test_vfmsq_laneq_f64_0(
2473 // CHECK-NEXT:  entry:
2474 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
2475 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
2476 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
2477 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
2478 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
2479 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2480 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
2481 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
2482 // CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
2483 // CHECK-NEXT:    ret <2 x double> [[TMP6]]
2484 //
2485 float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
2486   return vfmsq_laneq_f64(a, b, v, 0);
2487 }
2488
2489 // CHECK-LABEL: @test_vmlal_lane_s16_0(
2490 // CHECK-NEXT:  entry:
2491 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2492 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2493 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2494 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2495 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2496 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2497 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2498 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2499 //
2500 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2501   return vmlal_lane_s16(a, b, v, 0);
2502 }
2503
2504 // CHECK-LABEL: @test_vmlal_lane_s32_0(
2505 // CHECK-NEXT:  entry:
2506 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2507 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2508 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2509 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2510 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2511 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2512 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2513 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2514 //
2515 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2516   return vmlal_lane_s32(a, b, v, 0);
2517 }
2518
2519 // CHECK-LABEL: @test_vmlal_laneq_s16_0(
2520 // CHECK-NEXT:  entry:
2521 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2522 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2523 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2524 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2525 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2526 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2527 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2528 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2529 //
2530 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2531   return vmlal_laneq_s16(a, b, v, 0);
2532 }
2533
2534 // CHECK-LABEL: @test_vmlal_laneq_s32_0(
2535 // CHECK-NEXT:  entry:
2536 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2537 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2538 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2539 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2540 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2541 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2542 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2543 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2544 //
2545 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2546   return vmlal_laneq_s32(a, b, v, 0);
2547 }
2548
2549 // CHECK-LABEL: @test_vmlal_high_lane_s16_0(
2550 // CHECK-NEXT:  entry:
2551 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2552 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2553 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2554 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2555 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2556 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2557 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2558 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2559 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2560 //
2561 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2562   return vmlal_high_lane_s16(a, b, v, 0);
2563 }
2564
2565 // CHECK-LABEL: @test_vmlal_high_lane_s32_0(
2566 // CHECK-NEXT:  entry:
2567 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2568 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2569 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2570 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2571 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2572 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2573 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2574 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2575 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2576 //
2577 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2578   return vmlal_high_lane_s32(a, b, v, 0);
2579 }
2580
2581 // CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
2582 // CHECK-NEXT:  entry:
2583 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2584 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2585 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2586 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2587 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2588 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2589 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2590 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2591 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2592 //
2593 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2594   return vmlal_high_laneq_s16(a, b, v, 0);
2595 }
2596
2597 // CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
2598 // CHECK-NEXT:  entry:
2599 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2600 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2601 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2602 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2603 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2604 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2605 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2606 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2607 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2608 //
2609 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2610   return vmlal_high_laneq_s32(a, b, v, 0);
2611 }
2612
2613 // CHECK-LABEL: @test_vmlsl_lane_s16_0(
2614 // CHECK-NEXT:  entry:
2615 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2616 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2617 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2618 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2619 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2620 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2621 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2622 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2623 //
2624 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2625   return vmlsl_lane_s16(a, b, v, 0);
2626 }
2627
2628 // CHECK-LABEL: @test_vmlsl_lane_s32_0(
2629 // CHECK-NEXT:  entry:
2630 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2631 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2632 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2633 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2634 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2635 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2636 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2637 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2638 //
2639 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2640   return vmlsl_lane_s32(a, b, v, 0);
2641 }
2642
2643 // CHECK-LABEL: @test_vmlsl_laneq_s16_0(
2644 // CHECK-NEXT:  entry:
2645 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2646 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2647 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2648 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2649 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2650 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2651 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2652 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2653 //
2654 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2655   return vmlsl_laneq_s16(a, b, v, 0);
2656 }
2657
2658 // CHECK-LABEL: @test_vmlsl_laneq_s32_0(
2659 // CHECK-NEXT:  entry:
2660 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2661 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2662 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2663 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2664 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2665 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2666 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2667 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2668 //
2669 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2670   return vmlsl_laneq_s32(a, b, v, 0);
2671 }
2672
2673 // CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
2674 // CHECK-NEXT:  entry:
2675 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2676 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2677 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2678 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2679 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2680 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2681 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2682 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2683 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2684 //
2685 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2686   return vmlsl_high_lane_s16(a, b, v, 0);
2687 }
2688
2689 // CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
2690 // CHECK-NEXT:  entry:
2691 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2692 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2693 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2694 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2695 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2696 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2697 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2698 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2699 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2700 //
2701 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2702   return vmlsl_high_lane_s32(a, b, v, 0);
2703 }
2704
2705 // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
2706 // CHECK-NEXT:  entry:
2707 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2708 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2709 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2710 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2711 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2712 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2713 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2714 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2715 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2716 //
2717 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2718   return vmlsl_high_laneq_s16(a, b, v, 0);
2719 }
2720
2721 // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
2722 // CHECK-NEXT:  entry:
2723 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2724 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2725 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2726 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2727 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2728 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2729 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2730 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2731 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2732 //
2733 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2734   return vmlsl_high_laneq_s32(a, b, v, 0);
2735 }
2736
2737 // CHECK-LABEL: @test_vmlal_lane_u16_0(
2738 // CHECK-NEXT:  entry:
2739 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2740 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2741 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2742 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2743 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2744 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2745 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2746 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2747 //
2748 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2749   return vmlal_lane_u16(a, b, v, 0);
2750 }
2751
2752 // CHECK-LABEL: @test_vmlal_lane_u32_0(
2753 // CHECK-NEXT:  entry:
2754 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2755 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2756 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2757 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2758 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2759 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2760 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2761 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2762 //
2763 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2764   return vmlal_lane_u32(a, b, v, 0);
2765 }
2766
2767 // CHECK-LABEL: @test_vmlal_laneq_u16_0(
2768 // CHECK-NEXT:  entry:
2769 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2770 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2771 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2772 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2773 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2774 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2775 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2776 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2777 //
2778 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2779   return vmlal_laneq_u16(a, b, v, 0);
2780 }
2781
2782 // CHECK-LABEL: @test_vmlal_laneq_u32_0(
2783 // CHECK-NEXT:  entry:
2784 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2785 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2786 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2787 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2788 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2789 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2790 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2791 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2792 //
2793 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2794   return vmlal_laneq_u32(a, b, v, 0);
2795 }
2796
2797 // CHECK-LABEL: @test_vmlal_high_lane_u16_0(
2798 // CHECK-NEXT:  entry:
2799 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2800 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2801 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2802 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2803 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2804 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2805 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2806 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2807 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2808 //
2809 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2810   return vmlal_high_lane_u16(a, b, v, 0);
2811 }
2812
2813 // CHECK-LABEL: @test_vmlal_high_lane_u32_0(
2814 // CHECK-NEXT:  entry:
2815 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2816 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2817 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2818 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2819 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2820 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2821 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2822 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2823 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2824 //
2825 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2826   return vmlal_high_lane_u32(a, b, v, 0);
2827 }
2828
2829 // CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
2830 // CHECK-NEXT:  entry:
2831 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2832 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2833 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2834 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2835 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2836 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2837 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2838 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2839 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
2840 //
2841 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2842   return vmlal_high_laneq_u16(a, b, v, 0);
2843 }
2844
2845 // CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
2846 // CHECK-NEXT:  entry:
2847 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2848 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2849 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2850 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2851 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2852 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2853 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2854 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2855 // CHECK-NEXT:    ret <2 x i64> [[ADD]]
2856 //
2857 int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2858   return vmlal_high_laneq_u32(a, b, v, 0);
2859 }
2860
2861 // CHECK-LABEL: @test_vmlsl_lane_u16_0(
2862 // CHECK-NEXT:  entry:
2863 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2864 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2865 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2866 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2867 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2868 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2869 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2870 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2871 //
2872 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2873   return vmlsl_lane_u16(a, b, v, 0);
2874 }
2875
2876 // CHECK-LABEL: @test_vmlsl_lane_u32_0(
2877 // CHECK-NEXT:  entry:
2878 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2879 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2880 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2881 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2882 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2883 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2884 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2885 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2886 //
2887 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2888   return vmlsl_lane_u32(a, b, v, 0);
2889 }
2890
2891 // CHECK-LABEL: @test_vmlsl_laneq_u16_0(
2892 // CHECK-NEXT:  entry:
2893 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2894 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2895 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2896 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2897 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2898 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2899 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2900 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2901 //
2902 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2903   return vmlsl_laneq_u16(a, b, v, 0);
2904 }
2905
2906 // CHECK-LABEL: @test_vmlsl_laneq_u32_0(
2907 // CHECK-NEXT:  entry:
2908 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2909 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2910 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2911 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2912 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2913 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2914 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2915 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2916 //
2917 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2918   return vmlsl_laneq_u32(a, b, v, 0);
2919 }
2920
2921 // CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
2922 // CHECK-NEXT:  entry:
2923 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2924 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2925 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2926 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2927 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2928 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2929 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2930 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2931 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2932 //
2933 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2934   return vmlsl_high_lane_u16(a, b, v, 0);
2935 }
2936
2937 // CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
2938 // CHECK-NEXT:  entry:
2939 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2940 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2941 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2942 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2943 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2944 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2945 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2946 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2947 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2948 //
2949 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2950   return vmlsl_high_lane_u32(a, b, v, 0);
2951 }
2952
2953 // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
2954 // CHECK-NEXT:  entry:
2955 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2956 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2957 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2958 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2959 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2960 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2961 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2962 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2963 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
2964 //
2965 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2966   return vmlsl_high_laneq_u16(a, b, v, 0);
2967 }
2968
2969 // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
2970 // CHECK-NEXT:  entry:
2971 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2972 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2973 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2974 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2975 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2976 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2977 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2978 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2979 // CHECK-NEXT:    ret <2 x i64> [[SUB]]
2980 //
2981 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2982   return vmlsl_high_laneq_u32(a, b, v, 0);
2983 }
2984
2985 // CHECK-LABEL: @test_vmull_lane_s16_0(
2986 // CHECK-NEXT:  entry:
2987 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2988 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2989 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2990 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
2991 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2992 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
2993 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
2994 //
2995 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2996   return vmull_lane_s16(a, v, 0);
2997 }
2998
2999 // CHECK-LABEL: @test_vmull_lane_s32_0(
3000 // CHECK-NEXT:  entry:
3001 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3002 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3003 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3004 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3005 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3006 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3007 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3008 //
3009 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
3010   return vmull_lane_s32(a, v, 0);
3011 }
3012
3013 // CHECK-LABEL: @test_vmull_lane_u16_0(
3014 // CHECK-NEXT:  entry:
3015 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3016 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3017 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3018 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3019 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3020 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3021 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3022 //
3023 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
3024   return vmull_lane_u16(a, v, 0);
3025 }
3026
3027 // CHECK-LABEL: @test_vmull_lane_u32_0(
3028 // CHECK-NEXT:  entry:
3029 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3030 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3031 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3032 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3033 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3034 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3035 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3036 //
3037 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
3038   return vmull_lane_u32(a, v, 0);
3039 }
3040
3041 // CHECK-LABEL: @test_vmull_high_lane_s16_0(
3042 // CHECK-NEXT:  entry:
3043 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3044 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3045 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3046 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3047 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3048 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3049 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3050 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3051 //
3052 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
3053   return vmull_high_lane_s16(a, v, 0);
3054 }
3055
3056 // CHECK-LABEL: @test_vmull_high_lane_s32_0(
3057 // CHECK-NEXT:  entry:
3058 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3059 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3060 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3061 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3062 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3063 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3064 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3065 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3066 //
3067 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
3068   return vmull_high_lane_s32(a, v, 0);
3069 }
3070
3071 // CHECK-LABEL: @test_vmull_high_lane_u16_0(
3072 // CHECK-NEXT:  entry:
3073 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3074 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3075 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3076 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3077 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3078 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3079 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3080 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3081 //
3082 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
3083   return vmull_high_lane_u16(a, v, 0);
3084 }
3085
3086 // CHECK-LABEL: @test_vmull_high_lane_u32_0(
3087 // CHECK-NEXT:  entry:
3088 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3089 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3090 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3091 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3092 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3093 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3094 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3095 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3096 //
3097 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
3098   return vmull_high_lane_u32(a, v, 0);
3099 }
3100
3101 // CHECK-LABEL: @test_vmull_laneq_s16_0(
3102 // CHECK-NEXT:  entry:
3103 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3104 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3105 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3106 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3107 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3108 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3109 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3110 //
3111 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
3112   return vmull_laneq_s16(a, v, 0);
3113 }
3114
3115 // CHECK-LABEL: @test_vmull_laneq_s32_0(
3116 // CHECK-NEXT:  entry:
3117 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3118 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3119 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3120 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3121 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3122 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3123 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3124 //
3125 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
3126   return vmull_laneq_s32(a, v, 0);
3127 }
3128
3129 // CHECK-LABEL: @test_vmull_laneq_u16_0(
3130 // CHECK-NEXT:  entry:
3131 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3132 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3133 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3134 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3135 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3136 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3137 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3138 //
3139 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
3140   return vmull_laneq_u16(a, v, 0);
3141 }
3142
3143 // CHECK-LABEL: @test_vmull_laneq_u32_0(
3144 // CHECK-NEXT:  entry:
3145 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3146 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3147 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3148 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3149 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3150 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3151 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3152 //
3153 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
3154   return vmull_laneq_u32(a, v, 0);
3155 }
3156
3157 // CHECK-LABEL: @test_vmull_high_laneq_s16_0(
3158 // CHECK-NEXT:  entry:
3159 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3160 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3161 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3162 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3163 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3164 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3165 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3166 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3167 //
3168 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
3169   return vmull_high_laneq_s16(a, v, 0);
3170 }
3171
3172 // CHECK-LABEL: @test_vmull_high_laneq_s32_0(
3173 // CHECK-NEXT:  entry:
3174 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3175 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3176 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3177 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3178 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3179 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3180 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3181 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3182 //
3183 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
3184   return vmull_high_laneq_s32(a, v, 0);
3185 }
3186
3187 // CHECK-LABEL: @test_vmull_high_laneq_u16_0(
3188 // CHECK-NEXT:  entry:
3189 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3190 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3191 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3192 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3193 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3194 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3195 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3196 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
3197 //
3198 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
3199   return vmull_high_laneq_u16(a, v, 0);
3200 }
3201
3202 // CHECK-LABEL: @test_vmull_high_laneq_u32_0(
3203 // CHECK-NEXT:  entry:
3204 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3205 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3206 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3207 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3208 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3209 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3210 // CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3211 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
3212 //
3213 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
3214   return vmull_high_laneq_u32(a, v, 0);
3215 }
3216
3217 // CHECK-LABEL: @test_vqdmlal_lane_s16_0(
3218 // CHECK-NEXT:  entry:
3219 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3220 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3221 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3222 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3223 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
3224 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3225 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
3226 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3227 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
3228 //
3229 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
3230   return vqdmlal_lane_s16(a, b, v, 0);
3231 }
3232
3233 // CHECK-LABEL: @test_vqdmlal_lane_s32_0(
3234 // CHECK-NEXT:  entry:
3235 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3236 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3237 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3238 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3239 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
3240 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3241 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
3242 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3243 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
3244 //
3245 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
3246   return vqdmlal_lane_s32(a, b, v, 0);
3247 }
3248
3249 // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
3250 // CHECK-NEXT:  entry:
3251 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3252 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3253 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3254 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3255 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3256 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3257 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3258 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3259 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3260 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
3261 //
3262 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
3263   return vqdmlal_high_lane_s16(a, b, v, 0);
3264 }
3265
3266 // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
3267 // CHECK-NEXT:  entry:
3268 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3269 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3270 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3271 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3272 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3273 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3274 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3275 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3276 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3277 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
3278 //
3279 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
3280   return vqdmlal_high_lane_s32(a, b, v, 0);
3281 }
3282
3283 // CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
3284 // CHECK-NEXT:  entry:
3285 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3286 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3287 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3288 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3289 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
3290 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3291 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
3292 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3293 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
3294 //
3295 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
3296   return vqdmlsl_lane_s16(a, b, v, 0);
3297 }
3298
3299 // CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
3300 // CHECK-NEXT:  entry:
3301 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3302 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3303 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3304 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3305 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
3306 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3307 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
3308 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3309 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
3310 //
3311 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
3312   return vqdmlsl_lane_s32(a, b, v, 0);
3313 }
3314
3315 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
3316 // CHECK-NEXT:  entry:
3317 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3318 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3319 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3320 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3321 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3322 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3323 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3324 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3325 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3326 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
3327 //
3328 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
3329   return vqdmlsl_high_lane_s16(a, b, v, 0);
3330 }
3331
3332 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
3333 // CHECK-NEXT:  entry:
3334 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3335 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3336 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3337 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3338 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3339 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3340 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3341 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3342 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3343 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
3344 //
3345 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
3346   return vqdmlsl_high_lane_s32(a, b, v, 0);
3347 }
3348
3349 // CHECK-LABEL: @test_vqdmull_lane_s16_0(
3350 // CHECK-NEXT:  entry:
3351 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3352 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3353 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3354 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3355 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3356 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3357 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3358 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
3359 //
3360 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
3361   return vqdmull_lane_s16(a, v, 0);
3362 }
3363
3364 // CHECK-LABEL: @test_vqdmull_lane_s32_0(
3365 // CHECK-NEXT:  entry:
3366 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3367 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3368 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3369 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3370 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3371 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3372 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3373 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
3374 //
3375 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
3376   return vqdmull_lane_s32(a, v, 0);
3377 }
3378
3379 // CHECK-LABEL: @test_vqdmull_laneq_s16_0(
3380 // CHECK-NEXT:  entry:
3381 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3382 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3383 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3384 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3385 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3386 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3387 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3388 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
3389 //
3390 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
3391   return vqdmull_laneq_s16(a, v, 0);
3392 }
3393
3394 // CHECK-LABEL: @test_vqdmull_laneq_s32_0(
3395 // CHECK-NEXT:  entry:
3396 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3397 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3398 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3399 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3400 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3401 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3402 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3403 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
3404 //
3405 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
3406   return vqdmull_laneq_s32(a, v, 0);
3407 }
3408
3409 // CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
3410 // CHECK-NEXT:  entry:
3411 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3412 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3413 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3414 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3415 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3416 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3417 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3418 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3419 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
3420 //
3421 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
3422   return vqdmull_high_lane_s16(a, v, 0);
3423 }
3424
3425 // CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
3426 // CHECK-NEXT:  entry:
3427 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3428 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3429 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3430 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3431 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3432 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3433 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3434 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3435 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
3436 //
3437 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
3438   return vqdmull_high_lane_s32(a, v, 0);
3439 }
3440
3441 // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
3442 // CHECK-NEXT:  entry:
3443 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3444 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3445 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3446 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3447 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3448 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3449 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3450 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3451 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I]]
3452 //
3453 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
3454   return vqdmull_high_laneq_s16(a, v, 0);
3455 }
3456
3457 // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
3458 // CHECK-NEXT:  entry:
3459 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3460 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3461 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3462 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3463 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3464 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3465 // CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3466 // CHECK-NEXT:    [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3467 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I]]
3468 //
3469 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
3470   return vqdmull_high_laneq_s32(a, v, 0);
3471 }
3472
3473 // CHECK-LABEL: @test_vqdmulh_lane_s16_0(
3474 // CHECK-NEXT:  entry:
3475 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3476 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3477 // CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3478 // CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3479 // CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
3480 // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANE_V2]]
3481 //
3482 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
3483   return vqdmulh_lane_s16(a, v, 0);
3484 }
3485
3486 // CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
3487 // CHECK-NEXT:  entry:
3488 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
3489 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3490 // CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3491 // CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3492 // CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
3493 // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANE_V2]]
3494 //
3495 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
3496   return vqdmulhq_lane_s16(a, v, 0);
3497 }
3498
3499 // CHECK-LABEL: @test_vqdmulh_lane_s32_0(
3500 // CHECK-NEXT:  entry:
3501 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3502 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3503 // CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3504 // CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3505 // CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
3506 // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANE_V2]]
3507 //
3508 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
3509   return vqdmulh_lane_s32(a, v, 0);
3510 }
3511
3512 // CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
3513 // CHECK-NEXT:  entry:
3514 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3515 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3516 // CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3517 // CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3518 // CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
3519 // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANE_V2]]
3520 //
3521 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
3522   return vqdmulhq_lane_s32(a, v, 0);
3523 }
3524
3525 // CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
3526 // CHECK-NEXT:  entry:
3527 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3528 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3529 // CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3530 // CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3531 // CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
3532 // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANE_V2]]
3533 //
3534 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
3535   return vqrdmulh_lane_s16(a, v, 0);
3536 }
3537
3538 // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
3539 // CHECK-NEXT:  entry:
3540 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
3541 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3542 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3543 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3544 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
3545 // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
3546 //
3547 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
3548   return vqrdmulhq_lane_s16(a, v, 0);
3549 }
3550
3551 // CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
3552 // CHECK-NEXT:  entry:
3553 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3554 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3555 // CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3556 // CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3557 // CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
3558 // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANE_V2]]
3559 //
3560 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
3561   return vqrdmulh_lane_s32(a, v, 0);
3562 }
3563
3564 // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
3565 // CHECK-NEXT:  entry:
3566 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3567 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3568 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3569 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3570 // CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
3571 // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
3572 //
3573 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
3574   return vqrdmulhq_lane_s32(a, v, 0);
3575 }
3576
3577 // CHECK-LABEL: @test_vmul_lane_f32_0(
3578 // CHECK-NEXT:  entry:
3579 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3580 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3581 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
3582 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
3583 // CHECK-NEXT:    ret <2 x float> [[MUL]]
3584 //
3585 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
3586   return vmul_lane_f32(a, v, 0);
3587 }
3588
3589 // CHECK-LABEL: @test_vmulq_lane_f32_0(
3590 // CHECK-NEXT:  entry:
3591 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3592 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3593 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
3594 // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
3595 // CHECK-NEXT:    ret <4 x float> [[MUL]]
3596 //
3597 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
3598   return vmulq_lane_f32(a, v, 0);
3599 }
3600
3601 // CHECK-LABEL: @test_vmul_laneq_f32_0(
3602 // CHECK-NEXT:  entry:
3603 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3604 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3605 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
3606 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
3607 // CHECK-NEXT:    ret <2 x float> [[MUL]]
3608 //
3609 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
3610   return vmul_laneq_f32(a, v, 0);
3611 }
3612
3613 // CHECK-LABEL: @test_vmul_laneq_f64_0(
3614 // CHECK-NEXT:  entry:
3615 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
3616 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3617 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
3618 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3619 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
3620 // CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
3621 // CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
3622 // CHECK-NEXT:    ret <1 x double> [[TMP5]]
3623 //
3624 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
3625   return vmul_laneq_f64(a, v, 0);
3626 }
3627
3628 // CHECK-LABEL: @test_vmulq_laneq_f32_0(
3629 // CHECK-NEXT:  entry:
3630 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3631 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3632 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
3633 // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
3634 // CHECK-NEXT:    ret <4 x float> [[MUL]]
3635 //
3636 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
3637   return vmulq_laneq_f32(a, v, 0);
3638 }
3639
3640 // CHECK-LABEL: @test_vmulq_laneq_f64_0(
3641 // CHECK-NEXT:  entry:
3642 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3643 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3644 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
3645 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
3646 // CHECK-NEXT:    ret <2 x double> [[MUL]]
3647 //
3648 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
3649   return vmulq_laneq_f64(a, v, 0);
3650 }
3651
3652 // CHECK-LABEL: @test_vmulx_lane_f32_0(
3653 // CHECK-NEXT:  entry:
3654 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3655 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3656 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
3657 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
3658 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
3659 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
3660 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
3661 //
3662 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
3663   return vmulx_lane_f32(a, v, 0);
3664 }
3665
3666 // CHECK-LABEL: @test_vmulxq_lane_f32_0(
3667 // CHECK-NEXT:  entry:
3668 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3669 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3670 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
3671 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
3672 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
3673 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
3674 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
3675 //
3676 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
3677   return vmulxq_lane_f32(a, v, 0);
3678 }
3679
3680 // CHECK-LABEL: @test_vmulxq_lane_f64_0(
3681 // CHECK-NEXT:  entry:
3682 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
3683 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
3684 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
3685 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
3686 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
3687 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
3688 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
3689 //
3690 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
3691   return vmulxq_lane_f64(a, v, 0);
3692 }
3693
3694 // CHECK-LABEL: @test_vmulx_laneq_f32_0(
3695 // CHECK-NEXT:  entry:
3696 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3697 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3698 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
3699 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
3700 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
3701 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
3702 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
3703 //
3704 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
3705   return vmulx_laneq_f32(a, v, 0);
3706 }
3707
3708 // CHECK-LABEL: @test_vmulxq_laneq_f32_0(
3709 // CHECK-NEXT:  entry:
3710 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3711 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3712 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
3713 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
3714 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
3715 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
3716 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
3717 //
3718 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
3719   return vmulxq_laneq_f32(a, v, 0);
3720 }
3721
3722 // CHECK-LABEL: @test_vmulxq_laneq_f64_0(
3723 // CHECK-NEXT:  entry:
3724 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3725 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3726 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
3727 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
3728 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
3729 // CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
3730 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
3731 //
3732 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
3733   return vmulxq_laneq_f64(a, v, 0);
3734 }
3735
3736 // CHECK-LABEL: @test_vmull_high_n_s16(
3737 // CHECK-NEXT:  entry:
3738 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3739 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
3740 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
3741 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
3742 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
3743 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3744 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3745 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3746 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
3747 //
3748 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
3749   return vmull_high_n_s16(a, b);
3750 }
3751
3752 // CHECK-LABEL: @test_vmull_high_n_s32(
3753 // CHECK-NEXT:  entry:
3754 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3755 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
3756 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
3757 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3758 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3759 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3760 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
3761 //
3762 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
3763   return vmull_high_n_s32(a, b);
3764 }
3765
3766 // CHECK-LABEL: @test_vmull_high_n_u16(
3767 // CHECK-NEXT:  entry:
3768 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3769 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
3770 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
3771 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
3772 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
3773 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3774 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3775 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3776 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
3777 //
3778 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
3779   return vmull_high_n_u16(a, b);
3780 }
3781
3782 // CHECK-LABEL: @test_vmull_high_n_u32(
3783 // CHECK-NEXT:  entry:
3784 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3785 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
3786 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
3787 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3788 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3789 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3790 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
3791 //
3792 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
3793   return vmull_high_n_u32(a, b);
3794 }
3795
3796 // CHECK-LABEL: @test_vqdmull_high_n_s16(
3797 // CHECK-NEXT:  entry:
3798 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3799 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
3800 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
3801 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
3802 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
3803 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3804 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3805 // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3806 // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
3807 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I]]
3808 //
3809 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
3810   return vqdmull_high_n_s16(a, b);
3811 }
3812
3813 // CHECK-LABEL: @test_vqdmull_high_n_s32(
3814 // CHECK-NEXT:  entry:
3815 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3816 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
3817 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
3818 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3819 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3820 // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3821 // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
3822 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I]]
3823 //
3824 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
3825   return vqdmull_high_n_s32(a, b);
3826 }
3827
3828 // CHECK-LABEL: @test_vmlal_high_n_s16(
3829 // CHECK-NEXT:  entry:
3830 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3831 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3832 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3833 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3834 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3835 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3836 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3837 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3838 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3839 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
3840 //
3841 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3842   return vmlal_high_n_s16(a, b, c);
3843 }
3844
3845 // CHECK-LABEL: @test_vmlal_high_n_s32(
3846 // CHECK-NEXT:  entry:
3847 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3848 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3849 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3850 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3851 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3852 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3853 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3854 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
3855 //
3856 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3857   return vmlal_high_n_s32(a, b, c);
3858 }
3859
3860 // CHECK-LABEL: @test_vmlal_high_n_u16(
3861 // CHECK-NEXT:  entry:
3862 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3863 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3864 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3865 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3866 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3867 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3868 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3869 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3870 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3871 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
3872 //
3873 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
3874   return vmlal_high_n_u16(a, b, c);
3875 }
3876
3877 // CHECK-LABEL: @test_vmlal_high_n_u32(
3878 // CHECK-NEXT:  entry:
3879 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3880 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3881 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3882 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3883 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3884 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3885 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3886 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
3887 //
3888 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
3889   return vmlal_high_n_u32(a, b, c);
3890 }
3891
3892 // CHECK-LABEL: @test_vqdmlal_high_n_s16(
3893 // CHECK-NEXT:  entry:
3894 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3895 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3896 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3897 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3898 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3899 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3900 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3901 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3902 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3903 // CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
3904 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I]]
3905 //
3906 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3907   return vqdmlal_high_n_s16(a, b, c);
3908 }
3909
3910 // CHECK-LABEL: @test_vqdmlal_high_n_s32(
3911 // CHECK-NEXT:  entry:
3912 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3913 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3914 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3915 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3916 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3917 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3918 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3919 // CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
3920 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I]]
3921 //
3922 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3923   return vqdmlal_high_n_s32(a, b, c);
3924 }
3925
3926 // CHECK-LABEL: @test_vmlsl_high_n_s16(
3927 // CHECK-NEXT:  entry:
3928 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3929 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3930 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3931 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3932 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3933 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3934 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3935 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3936 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3937 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
3938 //
3939 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3940   return vmlsl_high_n_s16(a, b, c);
3941 }
3942
3943 // CHECK-LABEL: @test_vmlsl_high_n_s32(
3944 // CHECK-NEXT:  entry:
3945 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3946 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3947 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3948 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3949 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3950 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3951 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3952 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
3953 //
3954 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3955   return vmlsl_high_n_s32(a, b, c);
3956 }
3957
3958 // CHECK-LABEL: @test_vmlsl_high_n_u16(
3959 // CHECK-NEXT:  entry:
3960 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3961 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3962 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3963 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3964 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3965 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3966 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3967 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3968 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3969 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
3970 //
3971 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
3972   return vmlsl_high_n_u16(a, b, c);
3973 }
3974
3975 // CHECK-LABEL: @test_vmlsl_high_n_u32(
3976 // CHECK-NEXT:  entry:
3977 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3978 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
3979 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3980 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3981 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3982 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3983 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3984 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
3985 //
3986 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
3987   return vmlsl_high_n_u32(a, b, c);
3988 }
3989
3990 // CHECK-LABEL: @test_vqdmlsl_high_n_s16(
3991 // CHECK-NEXT:  entry:
3992 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3993 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
3994 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3995 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3996 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3997 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3998 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3999 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4000 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
4001 // CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
4002 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I]]
4003 //
4004 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
4005   return vqdmlsl_high_n_s16(a, b, c);
4006 }
4007
4008 // CHECK-LABEL: @test_vqdmlsl_high_n_s32(
4009 // CHECK-NEXT:  entry:
4010 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
4011 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4012 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4013 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
4014 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
4015 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4016 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
4017 // CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
4018 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I]]
4019 //
4020 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
4021   return vqdmlsl_high_n_s32(a, b, c);
4022 }
4023
4024 // CHECK-LABEL: @test_vmul_n_f32(
4025 // CHECK-NEXT:  entry:
4026 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i32 0
4027 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
4028 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x float> [[A:%.*]], [[VECINIT1_I]]
4029 // CHECK-NEXT:    ret <2 x float> [[MUL_I]]
4030 //
4031 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
4032   return vmul_n_f32(a, b);
4033 }
4034
4035 // CHECK-LABEL: @test_vmulq_n_f32(
4036 // CHECK-NEXT:  entry:
4037 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i32 0
4038 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
4039 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
4040 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
4041 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x float> [[A:%.*]], [[VECINIT3_I]]
4042 // CHECK-NEXT:    ret <4 x float> [[MUL_I]]
4043 //
4044 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
4045   return vmulq_n_f32(a, b);
4046 }
4047
4048 // CHECK-LABEL: @test_vmulq_n_f64(
4049 // CHECK-NEXT:  entry:
4050 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0
4051 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[B]], i32 1
4052 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x double> [[A:%.*]], [[VECINIT1_I]]
4053 // CHECK-NEXT:    ret <2 x double> [[MUL_I]]
4054 //
4055 float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
4056   return vmulq_n_f64(a, b);
4057 }
4058
4059 // CHECK-LABEL: @test_vfma_n_f32(
4060 // CHECK-NEXT:  entry:
4061 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
4062 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
4063 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
4064 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
4065 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
4066 // CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
4067 // CHECK-NEXT:    ret <2 x float> [[TMP3]]
4068 //
4069 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
4070   return vfma_n_f32(a, b, n);
4071 }
4072
4073 // CHECK-LABEL: @test_vfma_n_f64(
4074 // CHECK-NEXT:  entry:
4075 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
4076 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
4077 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
4078 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
4079 // CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
4080 // CHECK-NEXT:    ret <1 x double> [[TMP3]]
4081 //
4082 float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
4083   return vfma_n_f64(a, b, n);
4084 }
4085
4086 // CHECK-LABEL: @test_vfmaq_n_f32(
4087 // CHECK-NEXT:  entry:
4088 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
4089 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
4090 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
4091 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
4092 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
4093 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
4094 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
4095 // CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
4096 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
4097 //
4098 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
4099   return vfmaq_n_f32(a, b, n);
4100 }
4101
4102 // CHECK-LABEL: @test_vfms_n_f32(
4103 // CHECK-NEXT:  entry:
4104 // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <2 x float> [[B:%.*]]
4105 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
4106 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
4107 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
4108 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
4109 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
4110 // CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
4111 // CHECK-NEXT:    ret <2 x float> [[TMP3]]
4112 //
4113 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
4114   return vfms_n_f32(a, b, n);
4115 }
4116
4117 // CHECK-LABEL: @test_vfms_n_f64(
4118 // CHECK-NEXT:  entry:
4119 // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <1 x double> [[B:%.*]]
4120 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
4121 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
4122 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
4123 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
4124 // CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
4125 // CHECK-NEXT:    ret <1 x double> [[TMP3]]
4126 //
4127 float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
4128   return vfms_n_f64(a, b, n);
4129 }
4130
4131 // CHECK-LABEL: @test_vfmsq_n_f32(
4132 // CHECK-NEXT:  entry:
4133 // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <4 x float> [[B:%.*]]
4134 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
4135 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
4136 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
4137 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
4138 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
4139 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
4140 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
4141 // CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
4142 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
4143 //
4144 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
4145   return vfmsq_n_f32(a, b, n);
4146 }
4147
4148 // CHECK-LABEL: @test_vmul_n_s16(
4149 // CHECK-NEXT:  entry:
4150 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4151 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4152 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4153 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4154 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
4155 // CHECK-NEXT:    ret <4 x i16> [[MUL_I]]
4156 //
4157 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
4158   return vmul_n_s16(a, b);
4159 }
4160
4161 // CHECK-LABEL: @test_vmulq_n_s16(
4162 // CHECK-NEXT:  entry:
4163 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
4164 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4165 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4166 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4167 // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
4168 // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
4169 // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
4170 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
4171 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
4172 // CHECK-NEXT:    ret <8 x i16> [[MUL_I]]
4173 //
4174 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
4175   return vmulq_n_s16(a, b);
4176 }
4177
4178 // CHECK-LABEL: @test_vmul_n_s32(
4179 // CHECK-NEXT:  entry:
4180 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4181 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4182 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
4183 // CHECK-NEXT:    ret <2 x i32> [[MUL_I]]
4184 //
4185 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
4186   return vmul_n_s32(a, b);
4187 }
4188
4189 // CHECK-LABEL: @test_vmulq_n_s32(
4190 // CHECK-NEXT:  entry:
4191 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
4192 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4193 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
4194 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
4195 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
4196 // CHECK-NEXT:    ret <4 x i32> [[MUL_I]]
4197 //
4198 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
4199   return vmulq_n_s32(a, b);
4200 }
4201
4202 // CHECK-LABEL: @test_vmul_n_u16(
4203 // CHECK-NEXT:  entry:
4204 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4205 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4206 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4207 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4208 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
4209 // CHECK-NEXT:    ret <4 x i16> [[MUL_I]]
4210 //
4211 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
4212   return vmul_n_u16(a, b);
4213 }
4214
4215 // CHECK-LABEL: @test_vmulq_n_u16(
4216 // CHECK-NEXT:  entry:
4217 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
4218 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4219 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4220 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4221 // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
4222 // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
4223 // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
4224 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
4225 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
4226 // CHECK-NEXT:    ret <8 x i16> [[MUL_I]]
4227 //
4228 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
4229   return vmulq_n_u16(a, b);
4230 }
4231
4232 // CHECK-LABEL: @test_vmul_n_u32(
4233 // CHECK-NEXT:  entry:
4234 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4235 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4236 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
4237 // CHECK-NEXT:    ret <2 x i32> [[MUL_I]]
4238 //
4239 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
4240   return vmul_n_u32(a, b);
4241 }
4242
4243 // CHECK-LABEL: @test_vmulq_n_u32(
4244 // CHECK-NEXT:  entry:
4245 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
4246 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4247 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
4248 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
4249 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
4250 // CHECK-NEXT:    ret <4 x i32> [[MUL_I]]
4251 //
4252 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
4253   return vmulq_n_u32(a, b);
4254 }
4255
4256 // CHECK-LABEL: @test_vmull_n_s16(
4257 // CHECK-NEXT:  entry:
4258 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4259 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4260 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4261 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4262 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4263 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4264 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4265 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
4266 //
4267 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
4268   return vmull_n_s16(a, b);
4269 }
4270
4271 // CHECK-LABEL: @test_vmull_n_s32(
4272 // CHECK-NEXT:  entry:
4273 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4274 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4275 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4276 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4277 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4278 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
4279 //
4280 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
4281   return vmull_n_s32(a, b);
4282 }
4283
4284 // CHECK-LABEL: @test_vmull_n_u16(
4285 // CHECK-NEXT:  entry:
4286 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4287 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4288 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4289 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4290 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4291 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4292 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4293 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
4294 //
4295 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
4296   return vmull_n_u16(a, b);
4297 }
4298
4299 // CHECK-LABEL: @test_vmull_n_u32(
4300 // CHECK-NEXT:  entry:
4301 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4302 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4303 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4304 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4305 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4306 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
4307 //
4308 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
4309   return vmull_n_u32(a, b);
4310 }
4311
4312 // CHECK-LABEL: @test_vqdmull_n_s16(
4313 // CHECK-NEXT:  entry:
4314 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4315 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4316 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4317 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4318 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4319 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4320 // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4321 // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
4322 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I]]
4323 //
4324 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
4325   return vqdmull_n_s16(a, b);
4326 }
4327
4328 // CHECK-LABEL: @test_vqdmull_n_s32(
4329 // CHECK-NEXT:  entry:
4330 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4331 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4332 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4333 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4334 // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4335 // CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
4336 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I]]
4337 //
4338 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
4339   return vqdmull_n_s32(a, b);
4340 }
4341
4342 // CHECK-LABEL: @test_vqdmulh_n_s16(
4343 // CHECK-NEXT:  entry:
4344 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4345 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4346 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4347 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4348 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4349 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4350 // CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4351 // CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4352 // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
4353 //
4354 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
4355   return vqdmulh_n_s16(a, b);
4356 }
4357
4358 // CHECK-LABEL: @test_vqdmulhq_n_s16(
4359 // CHECK-NEXT:  entry:
4360 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
4361 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4362 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4363 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4364 // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
4365 // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
4366 // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
4367 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
4368 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
4369 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
4370 // CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
4371 // CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4372 // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
4373 //
4374 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
4375   return vqdmulhq_n_s16(a, b);
4376 }
4377
4378 // CHECK-LABEL: @test_vqdmulh_n_s32(
4379 // CHECK-NEXT:  entry:
4380 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4381 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4382 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4383 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4384 // CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4385 // CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4386 // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
4387 //
4388 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
4389   return vqdmulh_n_s32(a, b);
4390 }
4391
4392 // CHECK-LABEL: @test_vqdmulhq_n_s32(
4393 // CHECK-NEXT:  entry:
4394 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
4395 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4396 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
4397 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
4398 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4399 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
4400 // CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
4401 // CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4402 // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
4403 //
4404 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
4405   return vqdmulhq_n_s32(a, b);
4406 }
4407
4408 // CHECK-LABEL: @test_vqrdmulh_n_s16(
4409 // CHECK-NEXT:  entry:
4410 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
4411 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4412 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4413 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4414 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
4415 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4416 // CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
4417 // CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4418 // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
4419 //
4420 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
4421   return vqrdmulh_n_s16(a, b);
4422 }
4423
4424 // CHECK-LABEL: @test_vqrdmulhq_n_s16(
4425 // CHECK-NEXT:  entry:
4426 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
4427 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
4428 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
4429 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
4430 // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
4431 // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
4432 // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
4433 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
4434 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
4435 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
4436 // CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
4437 // CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4438 // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
4439 //
4440 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
4441   return vqrdmulhq_n_s16(a, b);
4442 }
4443
4444 // CHECK-LABEL: @test_vqrdmulh_n_s32(
4445 // CHECK-NEXT:  entry:
4446 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
4447 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4448 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
4449 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4450 // CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
4451 // CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4452 // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
4453 //
4454 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
4455   return vqrdmulh_n_s32(a, b);
4456 }
4457
4458 // CHECK-LABEL: @test_vqrdmulhq_n_s32(
4459 // CHECK-NEXT:  entry:
4460 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
4461 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
4462 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
4463 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
4464 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4465 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
4466 // CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
4467 // CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4468 // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
4469 //
4470 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
4471   return vqrdmulhq_n_s32(a, b);
4472 }
4473
4474 // CHECK-LABEL: @test_vmla_n_s16(
4475 // CHECK-NEXT:  entry:
4476 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4477 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4478 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4479 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4480 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4481 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
4482 // CHECK-NEXT:    ret <4 x i16> [[ADD_I]]
4483 //
4484 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
4485   return vmla_n_s16(a, b, c);
4486 }
4487
4488 // CHECK-LABEL: @test_vmlaq_n_s16(
4489 // CHECK-NEXT:  entry:
4490 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
4491 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4492 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4493 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4494 // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4495 // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4496 // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4497 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4498 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4499 // CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
4500 // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
4501 //
4502 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
4503   return vmlaq_n_s16(a, b, c);
4504 }
4505
4506 // CHECK-LABEL: @test_vmla_n_s32(
4507 // CHECK-NEXT:  entry:
4508 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4509 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4510 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4511 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
4512 // CHECK-NEXT:    ret <2 x i32> [[ADD_I]]
4513 //
4514 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
4515   return vmla_n_s32(a, b, c);
4516 }
4517
4518 // CHECK-LABEL: @test_vmlaq_n_s32(
4519 // CHECK-NEXT:  entry:
4520 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
4521 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4522 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4523 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4524 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4525 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
4526 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4527 //
4528 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
4529   return vmlaq_n_s32(a, b, c);
4530 }
4531
4532 // CHECK-LABEL: @test_vmla_n_u16(
4533 // CHECK-NEXT:  entry:
4534 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4535 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4536 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4537 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4538 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4539 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
4540 // CHECK-NEXT:    ret <4 x i16> [[ADD_I]]
4541 //
4542 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
4543   return vmla_n_u16(a, b, c);
4544 }
4545
4546 // CHECK-LABEL: @test_vmlaq_n_u16(
4547 // CHECK-NEXT:  entry:
4548 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
4549 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4550 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4551 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4552 // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4553 // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4554 // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4555 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4556 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4557 // CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
4558 // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
4559 //
4560 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
4561   return vmlaq_n_u16(a, b, c);
4562 }
4563
4564 // CHECK-LABEL: @test_vmla_n_u32(
4565 // CHECK-NEXT:  entry:
4566 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4567 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4568 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4569 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
4570 // CHECK-NEXT:    ret <2 x i32> [[ADD_I]]
4571 //
4572 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
4573   return vmla_n_u32(a, b, c);
4574 }
4575
4576 // CHECK-LABEL: @test_vmlaq_n_u32(
4577 // CHECK-NEXT:  entry:
4578 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
4579 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4580 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4581 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4582 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4583 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
4584 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4585 //
4586 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
4587   return vmlaq_n_u32(a, b, c);
4588 }
4589
4590 // CHECK-LABEL: @test_vmlal_n_s16(
4591 // CHECK-NEXT:  entry:
4592 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4593 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4594 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4595 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4596 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4597 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4598 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4599 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4600 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4601 //
4602 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4603   return vmlal_n_s16(a, b, c);
4604 }
4605
4606 // CHECK-LABEL: @test_vmlal_n_s32(
4607 // CHECK-NEXT:  entry:
4608 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4609 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4610 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4611 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4612 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4613 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4614 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
4615 //
4616 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4617   return vmlal_n_s32(a, b, c);
4618 }
4619
4620 // CHECK-LABEL: @test_vmlal_n_u16(
4621 // CHECK-NEXT:  entry:
4622 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4623 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4624 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4625 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4626 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4627 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4628 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4629 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4630 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4631 //
4632 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
4633   return vmlal_n_u16(a, b, c);
4634 }
4635
4636 // CHECK-LABEL: @test_vmlal_n_u32(
4637 // CHECK-NEXT:  entry:
4638 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4639 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4640 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4641 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4642 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4643 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4644 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
4645 //
4646 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
4647   return vmlal_n_u32(a, b, c);
4648 }
4649
4650 // CHECK-LABEL: @test_vqdmlal_n_s16(
4651 // CHECK-NEXT:  entry:
4652 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4653 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4654 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4655 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4656 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4657 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4658 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4659 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4660 // CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
4661 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I]]
4662 //
4663 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4664   return vqdmlal_n_s16(a, b, c);
4665 }
4666
4667 // CHECK-LABEL: @test_vqdmlal_n_s32(
4668 // CHECK-NEXT:  entry:
4669 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4670 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4671 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
4672 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4673 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4674 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4675 // CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
4676 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I]]
4677 //
4678 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4679   return vqdmlal_n_s32(a, b, c);
4680 }
4681
4682 // CHECK-LABEL: @test_vmls_n_s16(
4683 // CHECK-NEXT:  entry:
4684 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4685 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4686 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4687 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4688 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4689 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
4690 // CHECK-NEXT:    ret <4 x i16> [[SUB_I]]
4691 //
4692 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
4693   return vmls_n_s16(a, b, c);
4694 }
4695
4696 // CHECK-LABEL: @test_vmlsq_n_s16(
4697 // CHECK-NEXT:  entry:
4698 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
4699 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4700 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4701 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4702 // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4703 // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4704 // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4705 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4706 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4707 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
4708 // CHECK-NEXT:    ret <8 x i16> [[SUB_I]]
4709 //
4710 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
4711   return vmlsq_n_s16(a, b, c);
4712 }
4713
4714 // CHECK-LABEL: @test_vmls_n_s32(
4715 // CHECK-NEXT:  entry:
4716 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4717 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4718 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4719 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
4720 // CHECK-NEXT:    ret <2 x i32> [[SUB_I]]
4721 //
4722 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
4723   return vmls_n_s32(a, b, c);
4724 }
4725
4726 // CHECK-LABEL: @test_vmlsq_n_s32(
4727 // CHECK-NEXT:  entry:
4728 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
4729 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4730 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4731 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4732 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4733 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
4734 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
4735 //
4736 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
4737   return vmlsq_n_s32(a, b, c);
4738 }
4739
4740 // CHECK-LABEL: @test_vmls_n_u16(
4741 // CHECK-NEXT:  entry:
4742 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4743 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4744 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4745 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4746 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4747 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
4748 // CHECK-NEXT:    ret <4 x i16> [[SUB_I]]
4749 //
4750 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
4751   return vmls_n_u16(a, b, c);
4752 }
4753
4754 // CHECK-LABEL: @test_vmlsq_n_u16(
4755 // CHECK-NEXT:  entry:
4756 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
4757 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4758 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4759 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4760 // CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4761 // CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4762 // CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4763 // CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4764 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4765 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
4766 // CHECK-NEXT:    ret <8 x i16> [[SUB_I]]
4767 //
4768 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
4769   return vmlsq_n_u16(a, b, c);
4770 }
4771
4772 // CHECK-LABEL: @test_vmls_n_u32(
4773 // CHECK-NEXT:  entry:
4774 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4775 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4776 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4777 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
4778 // CHECK-NEXT:    ret <2 x i32> [[SUB_I]]
4779 //
4780 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
4781   return vmls_n_u32(a, b, c);
4782 }
4783
4784 // CHECK-LABEL: @test_vmlsq_n_u32(
4785 // CHECK-NEXT:  entry:
4786 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
4787 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4788 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4789 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4790 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4791 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
4792 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
4793 //
4794 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
4795   return vmlsq_n_u32(a, b, c);
4796 }
4797
4798 // CHECK-LABEL: @test_vmlsl_n_s16(
4799 // CHECK-NEXT:  entry:
4800 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4801 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4802 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4803 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4804 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4805 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4806 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4807 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4808 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
4809 //
4810 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4811   return vmlsl_n_s16(a, b, c);
4812 }
4813
4814 // CHECK-LABEL: @test_vmlsl_n_s32(
4815 // CHECK-NEXT:  entry:
4816 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4817 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4818 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4819 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4820 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4821 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4822 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
4823 //
4824 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4825   return vmlsl_n_s32(a, b, c);
4826 }
4827
4828 // CHECK-LABEL: @test_vmlsl_n_u16(
4829 // CHECK-NEXT:  entry:
4830 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4831 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4832 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4833 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4834 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4835 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4836 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4837 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4838 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
4839 //
4840 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
4841   return vmlsl_n_u16(a, b, c);
4842 }
4843
4844 // CHECK-LABEL: @test_vmlsl_n_u32(
4845 // CHECK-NEXT:  entry:
4846 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4847 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4848 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4849 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4850 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4851 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4852 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
4853 //
4854 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
4855   return vmlsl_n_u32(a, b, c);
4856 }
4857
4858 // CHECK-LABEL: @test_vqdmlsl_n_s16(
4859 // CHECK-NEXT:  entry:
4860 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
4861 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4862 // CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4863 // CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4864 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4865 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4866 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4867 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4868 // CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
4869 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I]]
4870 //
4871 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4872   return vqdmlsl_n_s16(a, b, c);
4873 }
4874
4875 // CHECK-LABEL: @test_vqdmlsl_n_s32(
4876 // CHECK-NEXT:  entry:
4877 // CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
4878 // CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4879 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
4880 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4881 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4882 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4883 // CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
4884 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I]]
4885 //
4886 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4887   return vqdmlsl_n_s32(a, b, c);
4888 }
4889
4890 // CHECK-LABEL: @test_vmla_lane_u16_0(
4891 // CHECK-NEXT:  entry:
4892 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
4893 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4894 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
4895 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
4896 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
4897 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
4898 //
4899 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4900   return vmla_lane_u16(a, b, v, 0);
4901 }
4902
4903 // CHECK-LABEL: @test_vmlaq_lane_u16_0(
4904 // CHECK-NEXT:  entry:
4905 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
4906 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4907 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
4908 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
4909 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
4910 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
4911 //
4912 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4913   return vmlaq_lane_u16(a, b, v, 0);
4914 }
4915
4916 // CHECK-LABEL: @test_vmla_lane_u32_0(
4917 // CHECK-NEXT:  entry:
4918 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
4919 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4920 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
4921 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
4922 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
4923 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
4924 //
4925 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4926   return vmla_lane_u32(a, b, v, 0);
4927 }
4928
4929 // CHECK-LABEL: @test_vmlaq_lane_u32_0(
4930 // CHECK-NEXT:  entry:
4931 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
4932 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4933 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
4934 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
4935 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
4936 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
4937 //
4938 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4939   return vmlaq_lane_u32(a, b, v, 0);
4940 }
4941
4942 // CHECK-LABEL: @test_vmla_laneq_u16_0(
4943 // CHECK-NEXT:  entry:
4944 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
4945 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4946 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
4947 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
4948 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
4949 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
4950 //
4951 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4952   return vmla_laneq_u16(a, b, v, 0);
4953 }
4954
4955 // CHECK-LABEL: @test_vmlaq_laneq_u16_0(
4956 // CHECK-NEXT:  entry:
4957 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
4958 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4959 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
4960 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
4961 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
4962 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
4963 //
4964 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4965   return vmlaq_laneq_u16(a, b, v, 0);
4966 }
4967
4968 // CHECK-LABEL: @test_vmla_laneq_u32_0(
4969 // CHECK-NEXT:  entry:
4970 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
4971 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4972 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
4973 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
4974 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
4975 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
4976 //
4977 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4978   return vmla_laneq_u32(a, b, v, 0);
4979 }
4980
4981 // CHECK-LABEL: @test_vmlaq_laneq_u32_0(
4982 // CHECK-NEXT:  entry:
4983 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
4984 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4985 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
4986 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
4987 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
4988 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
4989 //
4990 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4991   return vmlaq_laneq_u32(a, b, v, 0);
4992 }
4993
4994 // CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
4995 // CHECK-NEXT:  entry:
4996 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
4997 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4998 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
4999 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5000 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5001 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5002 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5003 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5004 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5005 //
5006 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
5007   return vqdmlal_laneq_s16(a, b, v, 0);
5008 }
5009
5010 // CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
5011 // CHECK-NEXT:  entry:
5012 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5013 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5014 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5015 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5016 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5017 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5018 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5019 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5020 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5021 //
5022 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
5023   return vqdmlal_laneq_s32(a, b, v, 0);
5024 }
5025
5026 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
5027 // CHECK-NEXT:  entry:
5028 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5029 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5030 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5031 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5032 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5033 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5034 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5035 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5036 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5037 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5038 //
5039 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
5040   return vqdmlal_high_laneq_s16(a, b, v, 0);
5041 }
5042
5043 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
5044 // CHECK-NEXT:  entry:
5045 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5046 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5047 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5048 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5049 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5050 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5051 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5052 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5053 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5054 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5055 //
5056 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
5057   return vqdmlal_high_laneq_s32(a, b, v, 0);
5058 }
5059
5060 // CHECK-LABEL: @test_vmls_lane_u16_0(
5061 // CHECK-NEXT:  entry:
5062 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5063 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5064 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
5065 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5066 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5067 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5068 //
5069 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
5070   return vmls_lane_u16(a, b, v, 0);
5071 }
5072
5073 // CHECK-LABEL: @test_vmlsq_lane_u16_0(
5074 // CHECK-NEXT:  entry:
5075 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5076 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5077 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
5078 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5079 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5080 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5081 //
5082 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
5083   return vmlsq_lane_u16(a, b, v, 0);
5084 }
5085
5086 // CHECK-LABEL: @test_vmls_lane_u32_0(
5087 // CHECK-NEXT:  entry:
5088 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5089 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5090 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
5091 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5092 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5093 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5094 //
5095 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
5096   return vmls_lane_u32(a, b, v, 0);
5097 }
5098
5099 // CHECK-LABEL: @test_vmlsq_lane_u32_0(
5100 // CHECK-NEXT:  entry:
5101 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5102 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5103 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
5104 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5105 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5106 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5107 //
5108 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
5109   return vmlsq_lane_u32(a, b, v, 0);
5110 }
5111
5112 // CHECK-LABEL: @test_vmls_laneq_u16_0(
5113 // CHECK-NEXT:  entry:
5114 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5115 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5116 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5117 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5118 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5119 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5120 //
5121 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5122   return vmls_laneq_u16(a, b, v, 0);
5123 }
5124
5125 // CHECK-LABEL: @test_vmlsq_laneq_u16_0(
5126 // CHECK-NEXT:  entry:
5127 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5128 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5129 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
5130 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5131 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5132 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5133 //
5134 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5135   return vmlsq_laneq_u16(a, b, v, 0);
5136 }
5137
5138 // CHECK-LABEL: @test_vmls_laneq_u32_0(
5139 // CHECK-NEXT:  entry:
5140 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5141 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5142 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5143 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5144 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5145 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5146 //
5147 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5148   return vmls_laneq_u32(a, b, v, 0);
5149 }
5150
5151 // CHECK-LABEL: @test_vmlsq_laneq_u32_0(
5152 // CHECK-NEXT:  entry:
5153 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5154 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5155 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
5156 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5157 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5158 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5159 //
5160 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5161   return vmlsq_laneq_u32(a, b, v, 0);
5162 }
5163
5164 // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
5165 // CHECK-NEXT:  entry:
5166 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5167 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5168 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5169 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5170 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5171 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5172 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5173 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5174 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5175 //
5176 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
5177   return vqdmlsl_laneq_s16(a, b, v, 0);
5178 }
5179
5180 // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
5181 // CHECK-NEXT:  entry:
5182 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5183 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5184 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5185 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5186 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5187 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5188 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5189 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5190 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5191 //
5192 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
5193   return vqdmlsl_laneq_s32(a, b, v, 0);
5194 }
5195
5196 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
5197 // CHECK-NEXT:  entry:
5198 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5199 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5200 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5201 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5202 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5203 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5204 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5205 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5206 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5207 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5208 //
5209 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
5210   return vqdmlsl_high_laneq_s16(a, b, v, 0);
5211 }
5212
5213 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
5214 // CHECK-NEXT:  entry:
5215 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5216 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5217 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5218 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5219 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5220 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5221 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5222 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5223 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5224 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5225 //
5226 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
5227   return vqdmlsl_high_laneq_s32(a, b, v, 0);
5228 }
5229
5230 // CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
5231 // CHECK-NEXT:  entry:
5232 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5233 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5234 // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5235 // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5236 // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
5237 // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANEQ_V2]]
5238 //
5239 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
5240   return vqdmulh_laneq_s16(a, v, 0);
5241 }
5242
5243 // CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
5244 // CHECK-NEXT:  entry:
5245 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5246 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5247 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5248 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5249 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
5250 // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
5251 //
5252 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
5253   return vqdmulhq_laneq_s16(a, v, 0);
5254 }
5255
5256 // CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
5257 // CHECK-NEXT:  entry:
5258 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5259 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5260 // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5261 // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5262 // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
5263 // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANEQ_V2]]
5264 //
5265 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
5266   return vqdmulh_laneq_s32(a, v, 0);
5267 }
5268
5269 // CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
5270 // CHECK-NEXT:  entry:
5271 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5272 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5273 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5274 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5275 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
5276 // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
5277 //
5278 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
5279   return vqdmulhq_laneq_s32(a, v, 0);
5280 }
5281
5282 // CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
5283 // CHECK-NEXT:  entry:
5284 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5285 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5286 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5287 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5288 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
5289 // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
5290 //
5291 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
5292   return vqrdmulh_laneq_s16(a, v, 0);
5293 }
5294
5295 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
5296 // CHECK-NEXT:  entry:
5297 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5298 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5299 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5300 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5301 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
5302 // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
5303 //
5304 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
5305   return vqrdmulhq_laneq_s16(a, v, 0);
5306 }
5307
5308 // CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
5309 // CHECK-NEXT:  entry:
5310 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5311 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5312 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5313 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5314 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
5315 // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
5316 //
5317 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
5318   return vqrdmulh_laneq_s32(a, v, 0);
5319 }
5320
5321 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
5322 // CHECK-NEXT:  entry:
5323 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5324 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5325 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5326 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5327 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
5328 // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
5329 //
5330 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
5331   return vqrdmulhq_laneq_s32(a, v, 0);
5332 }
5333
5334 // CHECK-LABEL: @test_vmla_lane_u16(
5335 // CHECK-NEXT:  entry:
5336 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5337 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5338 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5339 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5340 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
5341 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
5342 //
5343 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
5344   return vmla_lane_u16(a, b, v, 3);
5345 }
5346
5347 // CHECK-LABEL: @test_vmlaq_lane_u16(
5348 // CHECK-NEXT:  entry:
5349 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5350 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5351 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
5352 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5353 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
5354 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
5355 //
5356 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
5357   return vmlaq_lane_u16(a, b, v, 3);
5358 }
5359
5360 // CHECK-LABEL: @test_vmla_lane_u32(
5361 // CHECK-NEXT:  entry:
5362 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5363 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5364 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
5365 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5366 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
5367 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
5368 //
5369 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
5370   return vmla_lane_u32(a, b, v, 1);
5371 }
5372
5373 // CHECK-LABEL: @test_vmlaq_lane_u32(
5374 // CHECK-NEXT:  entry:
5375 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5376 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5377 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5378 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5379 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
5380 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
5381 //
5382 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
5383   return vmlaq_lane_u32(a, b, v, 1);
5384 }
5385
5386 // CHECK-LABEL: @test_vmla_laneq_u16(
5387 // CHECK-NEXT:  entry:
5388 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5389 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5390 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5391 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5392 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
5393 // CHECK-NEXT:    ret <4 x i16> [[ADD]]
5394 //
5395 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5396   return vmla_laneq_u16(a, b, v, 7);
5397 }
5398
5399 // CHECK-LABEL: @test_vmlaq_laneq_u16(
5400 // CHECK-NEXT:  entry:
5401 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5402 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5403 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
5404 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5405 // CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
5406 // CHECK-NEXT:    ret <8 x i16> [[ADD]]
5407 //
5408 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5409   return vmlaq_laneq_u16(a, b, v, 7);
5410 }
5411
5412 // CHECK-LABEL: @test_vmla_laneq_u32(
5413 // CHECK-NEXT:  entry:
5414 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5415 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5416 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5417 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5418 // CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
5419 // CHECK-NEXT:    ret <2 x i32> [[ADD]]
5420 //
5421 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5422   return vmla_laneq_u32(a, b, v, 3);
5423 }
5424
5425 // CHECK-LABEL: @test_vmlaq_laneq_u32(
5426 // CHECK-NEXT:  entry:
5427 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5428 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5429 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5430 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5431 // CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
5432 // CHECK-NEXT:    ret <4 x i32> [[ADD]]
5433 //
5434 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5435   return vmlaq_laneq_u32(a, b, v, 3);
5436 }
5437
5438 // CHECK-LABEL: @test_vqdmlal_laneq_s16(
5439 // CHECK-NEXT:  entry:
5440 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5441 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5442 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5443 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5444 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5445 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5446 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5447 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5448 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5449 //
5450 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
5451   return vqdmlal_laneq_s16(a, b, v, 7);
5452 }
5453
5454 // CHECK-LABEL: @test_vqdmlal_laneq_s32(
5455 // CHECK-NEXT:  entry:
5456 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5457 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5458 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5459 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5460 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5461 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5462 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5463 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5464 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5465 //
5466 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
5467   return vqdmlal_laneq_s32(a, b, v, 3);
5468 }
5469
5470 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
5471 // CHECK-NEXT:  entry:
5472 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5473 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5474 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5475 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5476 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5477 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5478 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5479 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5480 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5481 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I]]
5482 //
5483 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
5484   return vqdmlal_high_laneq_s16(a, b, v, 7);
5485 }
5486
5487 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
5488 // CHECK-NEXT:  entry:
5489 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5490 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5491 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5492 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5493 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5494 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5495 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5496 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5497 // CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5498 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I]]
5499 //
5500 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
5501   return vqdmlal_high_laneq_s32(a, b, v, 3);
5502 }
5503
5504 // CHECK-LABEL: @test_vmls_lane_u16(
5505 // CHECK-NEXT:  entry:
5506 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5507 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5508 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5509 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5510 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5511 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5512 //
5513 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
5514   return vmls_lane_u16(a, b, v, 3);
5515 }
5516
5517 // CHECK-LABEL: @test_vmlsq_lane_u16(
5518 // CHECK-NEXT:  entry:
5519 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5520 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5521 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
5522 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5523 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5524 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5525 //
5526 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
5527   return vmlsq_lane_u16(a, b, v, 3);
5528 }
5529
5530 // CHECK-LABEL: @test_vmls_lane_u32(
5531 // CHECK-NEXT:  entry:
5532 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5533 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5534 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
5535 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5536 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5537 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5538 //
5539 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
5540   return vmls_lane_u32(a, b, v, 1);
5541 }
5542
5543 // CHECK-LABEL: @test_vmlsq_lane_u32(
5544 // CHECK-NEXT:  entry:
5545 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5546 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5547 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5548 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5549 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5550 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5551 //
5552 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
5553   return vmlsq_lane_u32(a, b, v, 1);
5554 }
5555
5556 // CHECK-LABEL: @test_vmls_laneq_u16(
5557 // CHECK-NEXT:  entry:
5558 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5559 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5560 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5561 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5562 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5563 // CHECK-NEXT:    ret <4 x i16> [[SUB]]
5564 //
5565 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5566   return vmls_laneq_u16(a, b, v, 7);
5567 }
5568
5569 // CHECK-LABEL: @test_vmlsq_laneq_u16(
5570 // CHECK-NEXT:  entry:
5571 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5572 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5573 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
5574 // CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5575 // CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5576 // CHECK-NEXT:    ret <8 x i16> [[SUB]]
5577 //
5578 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5579   return vmlsq_laneq_u16(a, b, v, 7);
5580 }
5581
5582 // CHECK-LABEL: @test_vmls_laneq_u32(
5583 // CHECK-NEXT:  entry:
5584 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5585 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5586 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5587 // CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5588 // CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5589 // CHECK-NEXT:    ret <2 x i32> [[SUB]]
5590 //
5591 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5592   return vmls_laneq_u32(a, b, v, 3);
5593 }
5594
5595 // CHECK-LABEL: @test_vmlsq_laneq_u32(
5596 // CHECK-NEXT:  entry:
5597 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5598 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5599 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5600 // CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5601 // CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5602 // CHECK-NEXT:    ret <4 x i32> [[SUB]]
5603 //
5604 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5605   return vmlsq_laneq_u32(a, b, v, 3);
5606 }
5607
5608 // CHECK-LABEL: @test_vqdmlsl_laneq_s16(
5609 // CHECK-NEXT:  entry:
5610 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5611 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5612 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5613 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5614 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5615 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5616 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5617 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5618 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5619 //
5620 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
5621   return vqdmlsl_laneq_s16(a, b, v, 7);
5622 }
5623
5624 // CHECK-LABEL: @test_vqdmlsl_laneq_s32(
5625 // CHECK-NEXT:  entry:
5626 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5627 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5628 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5629 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5630 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5631 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5632 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5633 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5634 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5635 //
5636 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
5637   return vqdmlsl_laneq_s32(a, b, v, 3);
5638 }
5639
5640 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
5641 // CHECK-NEXT:  entry:
5642 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5643 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5644 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5645 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5646 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5647 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5648 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5649 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5650 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5651 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I]]
5652 //
5653 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
5654   return vqdmlsl_high_laneq_s16(a, b, v, 7);
5655 }
5656
5657 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
5658 // CHECK-NEXT:  entry:
5659 // CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5660 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5661 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5662 // CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5663 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5664 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5665 // CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5666 // CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5667 // CHECK-NEXT:    [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5668 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I]]
5669 //
5670 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
5671   return vqdmlsl_high_laneq_s32(a, b, v, 3);
5672 }
5673
5674 // CHECK-LABEL: @test_vqdmulh_laneq_s16(
5675 // CHECK-NEXT:  entry:
5676 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5677 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5678 // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5679 // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5680 // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
5681 // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_LANEQ_V2]]
5682 //
5683 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
5684   return vqdmulh_laneq_s16(a, v, 7);
5685 }
5686
5687 // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
5688 // CHECK-NEXT:  entry:
5689 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5690 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5691 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5692 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5693 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
5694 // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
5695 //
5696 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
5697   return vqdmulhq_laneq_s16(a, v, 7);
5698 }
5699
5700 // CHECK-LABEL: @test_vqdmulh_laneq_s32(
5701 // CHECK-NEXT:  entry:
5702 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5703 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5704 // CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5705 // CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5706 // CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
5707 // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_LANEQ_V2]]
5708 //
5709 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
5710   return vqdmulh_laneq_s32(a, v, 3);
5711 }
5712
5713 // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
5714 // CHECK-NEXT:  entry:
5715 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5716 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5717 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5718 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5719 // CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
5720 // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
5721 //
5722 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
5723   return vqdmulhq_laneq_s32(a, v, 3);
5724 }
5725
5726 // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
5727 // CHECK-NEXT:  entry:
5728 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5729 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5730 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5731 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5732 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
5733 // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
5734 //
5735 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
5736   return vqrdmulh_laneq_s16(a, v, 7);
5737 }
5738
5739 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
5740 // CHECK-NEXT:  entry:
5741 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5742 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5743 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5744 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5745 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
5746 // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
5747 //
5748 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
5749   return vqrdmulhq_laneq_s16(a, v, 7);
5750 }
5751
5752 // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
5753 // CHECK-NEXT:  entry:
5754 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5755 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5756 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5757 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5758 // CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
5759 // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
5760 //
5761 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
5762   return vqrdmulh_laneq_s32(a, v, 3);
5763 }
5764
5765 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
5766 // CHECK-NEXT:  entry:
5767 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5768 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5769 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5770 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5771 // CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
5772 // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
5773 //
5774 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
5775   return vqrdmulhq_laneq_s32(a, v, 3);
5776 }