clang/test/CodeGen/AArch64/neon-vcmla.c

   1 // RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon \
   2 // RUN:        -target-feature +v8.3a \
   3 // RUN:        -target-feature +fullfp16 \
   4 // RUN:        -disable-O0-optnone -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
   5
   6 // REQUIRES: aarch64-registered-target
   7
   8 #include <arm_neon.h>
   9
  10 // CHECK-LABEL: @test_vcmla_f16(
  11 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
  12 // CHECK: ret <4 x half> [[RES]]
  13 float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  14   return vcmla_f16(acc, lhs, rhs);
  15 }
  16
  17 // CHECK-LABEL: @test_vcmla_f32(
  18 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
  19 // CHECK: ret <2 x float> [[RES]]
  20 float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  21   return vcmla_f32(acc, lhs, rhs);
  22 }
  23
  24 // CHECK-LABEL: @test_vcmlaq_f16(
  25 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
  26 // CHECK: ret <8 x half> [[RES]]
  27 float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  28   return vcmlaq_f16(acc, lhs, rhs);
  29 }
  30
  31 // CHECK-LABEL: @test_vcmlaq_f32(
  32 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
  33 // CHECK: ret <4 x float> [[RES]]
  34 float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  35   return vcmlaq_f32(acc, lhs, rhs);
  36 }
  37
  38 // CHECK-LABEL: @test_vcmlaq_f64(
  39 // CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
  40 // CHECK: ret <2 x double> [[RES]]
  41 float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  42   return vcmlaq_f64(acc, lhs, rhs);
  43 }
  44
  45 // CHECK-LABEL: @test_vcmla_rot90_f16(
  46 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
  47 // CHECK: ret <4 x half> [[RES]]
  48 float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  49   return vcmla_rot90_f16(acc, lhs, rhs);
  50 }
  51
  52 // CHECK-LABEL: @test_vcmla_rot90_f32(
  53 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
  54 // CHECK: ret <2 x float> [[RES]]
  55 float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  56   return vcmla_rot90_f32(acc, lhs, rhs);
  57 }
  58
  59 // CHECK-LABEL: @test_vcmlaq_rot90_f16(
  60 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
  61 // CHECK: ret <8 x half> [[RES]]
  62 float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  63   return vcmlaq_rot90_f16(acc, lhs, rhs);
  64 }
  65
  66 // CHECK-LABEL: @test_vcmlaq_rot90_f32(
  67 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
  68 // CHECK: ret <4 x float> [[RES]]
  69 float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  70   return vcmlaq_rot90_f32(acc, lhs, rhs);
  71 }
  72
  73 // CHECK-LABEL: @test_vcmlaq_rot90_f64(
  74 // CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
  75 // CHECK: ret <2 x double> [[RES]]
  76 float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
  77   return vcmlaq_rot90_f64(acc, lhs, rhs);
  78 }
  79
  80 // CHECK-LABEL: @test_vcmla_rot180_f16(
  81 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
  82 // CHECK: ret <4 x half> [[RES]]
  83 float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  84   return vcmla_rot180_f16(acc, lhs, rhs);
  85 }
  86
  87 // CHECK-LABEL: @test_vcmla_rot180_f32(
  88 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
  89 // CHECK: ret <2 x float> [[RES]]
  90 float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
  91   return vcmla_rot180_f32(acc, lhs, rhs);
  92 }
  93
  94 // CHECK-LABEL: @test_vcmlaq_rot180_f16(
  95 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
  96 // CHECK: ret <8 x half> [[RES]]
  97 float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
  98   return vcmlaq_rot180_f16(acc, lhs, rhs);
  99 }
 100
 101 // CHECK-LABEL: @test_vcmlaq_rot180_f32(
 102 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
 103 // CHECK: ret <4 x float> [[RES]]
 104 float32x4_t test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
 105   return vcmlaq_rot180_f32(acc, lhs, rhs);
 106 }
 107
 108 // CHECK-LABEL: @test_vcmlaq_rot180_f64(
 109 // CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
 110 // CHECK: ret <2 x double> [[RES]]
 111 float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
 112   return vcmlaq_rot180_f64(acc, lhs, rhs);
 113 }
 114
 115 // CHECK-LABEL: @test_vcmla_rot270_f16(
 116 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
 117 // CHECK: ret <4 x half> [[RES]]
 118 float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
 119   return vcmla_rot270_f16(acc, lhs, rhs);
 120 }
 121
 122 // CHECK-LABEL: @test_vcmla_rot270_f32(
 123 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
 124 // CHECK: ret <2 x float> [[RES]]
 125 float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
 126   return vcmla_rot270_f32(acc, lhs, rhs);
 127 }
 128
 129 // CHECK-LABEL: @test_vcmlaq_rot270_f16(
 130 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
 131 // CHECK: ret <8 x half> [[RES]]
 132 float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
 133   return vcmlaq_rot270_f16(acc, lhs, rhs);
 134 }
 135
 136 // CHECK-LABEL: @test_vcmlaq_rot270_f32(
 137 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
 138 // CHECK: ret <4 x float> [[RES]]
 139 float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
 140   return vcmlaq_rot270_f32(acc, lhs, rhs);
 141 }
 142
 143 // CHECK-LABEL: @test_vcmlaq_rot270_f64(
 144 // CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
 145 // CHECK: ret <2 x double> [[RES]]
 146 float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
 147   return vcmlaq_rot270_f64(acc, lhs, rhs);
 148 }
 149
 150 // CHECK-LABEL: @test_vcmla_lane_f16(
 151 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 152 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
 153 // CHECK: ret <4 x half> [[RES]]
 154 float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
 155   return vcmla_lane_f16(acc, lhs, rhs, 1);
 156 }
 157
 158 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
 159 // CHECK-LABEL: @test_vcmla_laneq_f16(
 160 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
 161 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
 162 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
 163 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
 164 // CHECK: ret <4 x half> [[RES]]
 165 float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
 166   return vcmla_laneq_f16(acc, lhs, rhs, 3);
 167 }
 168
 169 // CHECK-LABEL: @test_vcmlaq_lane_f16(
 170 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 171 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
 172 // CHECK: ret <8 x half> [[RES]]
 173 float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
 174   return vcmlaq_lane_f16(acc, lhs, rhs, 1);
 175 }
 176
 177 // CHECK-LABEL: @test_vcmlaq_laneq_f16(
 178 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
 179 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 180 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
 181 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
 182 // CHECK: ret <8 x half> [[RES]]
 183 float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
 184   return vcmlaq_laneq_f16(acc, lhs, rhs, 3);
 185 }
 186
 187 // CHECK-LABEL: @test_vcmla_lane_f32(
 188 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
 189 // CHECK: ret <2 x float> [[RES]]
 190 float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
 191   return vcmla_lane_f32(acc, lhs, rhs, 0);
 192 }
 193
 194 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
 195 // CHECK-LABEL: @test_vcmla_laneq_f32(
 196 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
 197 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
 198 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
 199 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
 200 // CHECK: ret <2 x float> [[RES]]
 201 float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
 202   return vcmla_laneq_f32(acc, lhs, rhs, 1);
 203 }
 204
 205 // CHECK-LABEL: @test_vcmlaq_lane_f32(
 206 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
 207 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
 208 // CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
 209 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 210 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 211 // CHECK: ret <4 x float> [[RES]]
 212 float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
 213   return vcmlaq_lane_f32(acc, lhs, rhs, 0);
 214 }
 215
 216 // CHECK-LABEL: @test_vcmlaq_laneq_f32(
 217 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 218 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 219 // CHECK: ret <4 x float> [[RES]]
 220 float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
 221   return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
 222 }
 223
 224 // CHECK-LABEL: @test_vcmla_rot90_lane_f16(
 225 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 226 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
 227 // CHECK: ret <4 x half> [[RES]]
 228 float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
 229   return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
 230 }
 231
 232 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
 233 // CHECK-LABEL: @test_vcmla_rot90_laneq_f16(
 234 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
 235 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
 236 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
 237 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
 238 // CHECK: ret <4 x half> [[RES]]
 239 float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
 240   return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3);
 241 }
 242
 243 // CHECK-LABEL: @test_vcmlaq_rot90_lane_f16(
 244 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 245 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
 246 // CHECK: ret <8 x half> [[RES]]
 247 float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
 248   return vcmlaq_rot90_lane_f16(acc, lhs, rhs, 1);
 249 }
 250
 251 // CHECK-LABEL: @test_vcmlaq_rot90_laneq_f16(
 252 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
 253 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 254 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
 255 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
 256 // CHECK: ret <8 x half> [[RES]]
 257 float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
 258   return vcmlaq_rot90_laneq_f16(acc, lhs, rhs, 3);
 259 }
 260
 261 // CHECK-LABEL: @test_vcmla_rot90_lane_f32(
 262 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
 263 // CHECK: ret <2 x float> [[RES]]
 264 float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
 265   return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
 266 }
 267
 268 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
 269 // CHECK-LABEL: @test_vcmla_rot90_laneq_f32(
 270 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
 271 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
 272 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
 273 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
 274 // CHECK: ret <2 x float> [[RES]]
 275 float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
 276   return vcmla_rot90_laneq_f32(acc, lhs, rhs, 1);
 277 }
 278
 279 // CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
 280 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
 281 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
 282 // CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
 283 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 284 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 285 // CHECK: ret <4 x float> [[RES]]
 286 float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
 287   return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
 288 }
 289
 290 // CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
 291 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 292 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 293 // CHECK: ret <4 x float> [[RES]]
 294 float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
 295   return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
 296 }
 297
 298 // CHECK-LABEL: @test_vcmla_rot180_lane_f16(
 299 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 300 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
 301 // CHECK: ret <4 x half> [[RES]]
 302 float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
 303   return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
 304 }
 305
 306 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
 307 // CHECK-LABEL: @test_vcmla_rot180_laneq_f16(
 308 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
 309 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
 310 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
 311 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
 312 // CHECK: ret <4 x half> [[RES]]
 313 float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
 314   return vcmla_rot180_laneq_f16(acc, lhs, rhs, 3);
 315 }
 316
 317 // CHECK-LABEL: @test_vcmlaq_rot180_lane_f16(
 318 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 319 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
 320 // CHECK: ret <8 x half> [[RES]]
 321 float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
 322   return vcmlaq_rot180_lane_f16(acc, lhs, rhs, 1);
 323 }
 324
 325 // CHECK-LABEL: @test_vcmlaq_rot180_laneq_f16(
 326 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
 327 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 328 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
 329 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
 330 // CHECK: ret <8 x half> [[RES]]
 331 float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
 332   return vcmlaq_rot180_laneq_f16(acc, lhs, rhs, 3);
 333 }
 334
 335 // CHECK-LABEL: @test_vcmla_rot180_lane_f32(
 336 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
 337 // CHECK: ret <2 x float> [[RES]]
 338 float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
 339   return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
 340 }
 341
 342 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
 343 // CHECK-LABEL: @test_vcmla_rot180_laneq_f32(
 344 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
 345 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
 346 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
 347 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
 348 // CHECK: ret <2 x float> [[RES]]
 349 float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
 350   return vcmla_rot180_laneq_f32(acc, lhs, rhs, 1);
 351 }
 352
 353 // CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
 354 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
 355 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
 356 // CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
 357 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 358 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 359 // CHECK: ret <4 x float> [[RES]]
 360 float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
 361   return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
 362 }
 363
 364 // CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
 365 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 366 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 367 // CHECK: ret <4 x float> [[RES]]
 368 float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
 369   return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
 370 }
 371
 372 // CHECK-LABEL: @test_vcmla_rot270_lane_f16(
 373 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 374 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
 375 // CHECK: ret <4 x half> [[RES]]
 376 float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
 377   return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
 378 }
 379
 380 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
 381 // CHECK-LABEL: @test_vcmla_rot270_laneq_f16(
 382 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
 383 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
 384 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
 385 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
 386 // CHECK: ret <4 x half> [[RES]]
 387 float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
 388   return vcmla_rot270_laneq_f16(acc, lhs, rhs, 3);
 389 }
 390
 391 // CHECK-LABEL: @test_vcmlaq_rot270_lane_f16(
 392 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 393 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
 394 // CHECK: ret <8 x half> [[RES]]
 395 float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
 396   return vcmlaq_rot270_lane_f16(acc, lhs, rhs, 1);
 397 }
 398
 399 // CHECK-LABEL: @test_vcmlaq_rot270_laneq_f16(
 400 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
 401 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 402 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
 403 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
 404 // CHECK: ret <8 x half> [[RES]]
 405 float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
 406   return vcmlaq_rot270_laneq_f16(acc, lhs, rhs, 3);
 407 }
 408
 409 // CHECK-LABEL: @test_vcmla_rot270_lane_f32(
 410 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
 411 // CHECK: ret <2 x float> [[RES]]
 412 float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
 413   return vcmla_rot270_lane_f32(acc, lhs, rhs, 0);
 414 }
 415
 416 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
 417 // CHECK-LABEL: @test_vcmla_rot270_laneq_f32(
 418 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
 419 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
 420 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
 421 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
 422 // CHECK: ret <2 x float> [[RES]]
 423 float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
 424   return vcmla_rot270_laneq_f32(acc, lhs, rhs, 1);
 425 }
 426
 427 // CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
 428 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
 429 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
 430 // CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
 431 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 432 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 433 // CHECK: ret <4 x float> [[RES]]
 434 float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
 435   return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
 436 }
 437
 438 // CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
 439 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 440 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 441 // CHECK: ret <4 x float> [[RES]]
 442 float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
 443   return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
 444 }