; RUN: llc < %s -asm-verbose=false -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
4 define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
7 %tmp1 = load <8 x i8>, <8 x i8>* %A
8 %tmp2 = load <8 x i8>, <8 x i8>* %B
9 %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
13 define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
14 ;CHECK-LABEL: smull4s:
16 %tmp1 = load <4 x i16>, <4 x i16>* %A
17 %tmp2 = load <4 x i16>, <4 x i16>* %B
18 %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
22 define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
23 ;CHECK-LABEL: smull2d:
25 %tmp1 = load <2 x i32>, <2 x i32>* %A
26 %tmp2 = load <2 x i32>, <2 x i32>* %B
27 %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
; Signed widening multiply (smull) intrinsic declarations used by the tests above.
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
35 define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
36 ;CHECK-LABEL: umull8h:
38 %tmp1 = load <8 x i8>, <8 x i8>* %A
39 %tmp2 = load <8 x i8>, <8 x i8>* %B
40 %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
44 define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
45 ;CHECK-LABEL: umull4s:
47 %tmp1 = load <4 x i16>, <4 x i16>* %A
48 %tmp2 = load <4 x i16>, <4 x i16>* %B
49 %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
53 define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
54 ;CHECK-LABEL: umull2d:
56 %tmp1 = load <2 x i32>, <2 x i32>* %A
57 %tmp2 = load <2 x i32>, <2 x i32>* %B
58 %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
; Unsigned widening multiply (umull) intrinsic declarations used by the tests above.
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
66 define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
67 ;CHECK-LABEL: sqdmull4s:
69 %tmp1 = load <4 x i16>, <4 x i16>* %A
70 %tmp2 = load <4 x i16>, <4 x i16>* %B
71 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
75 define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
76 ;CHECK-LABEL: sqdmull2d:
78 %tmp1 = load <2 x i32>, <2 x i32>* %A
79 %tmp2 = load <2 x i32>, <2 x i32>* %B
80 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
84 define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
85 ;CHECK-LABEL: sqdmull2_4s:
87 %load1 = load <8 x i16>, <8 x i16>* %A
88 %load2 = load <8 x i16>, <8 x i16>* %B
89 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
90 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
91 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
95 define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
96 ;CHECK-LABEL: sqdmull2_2d:
98 %load1 = load <4 x i32>, <4 x i32>* %A
99 %load2 = load <4 x i32>, <4 x i32>* %B
100 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
101 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
102 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
; Saturating doubling widening multiply (sqdmull) intrinsic declarations.
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
110 define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
111 ;CHECK-LABEL: pmull8h:
113 %tmp1 = load <8 x i8>, <8 x i8>* %A
114 %tmp2 = load <8 x i8>, <8 x i8>* %B
115 %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
; Polynomial multiply (pmull) intrinsic declaration.
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
121 define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
122 ;CHECK-LABEL: sqdmulh_4h:
124 %tmp1 = load <4 x i16>, <4 x i16>* %A
125 %tmp2 = load <4 x i16>, <4 x i16>* %B
126 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
130 define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
131 ;CHECK-LABEL: sqdmulh_8h:
133 %tmp1 = load <8 x i16>, <8 x i16>* %A
134 %tmp2 = load <8 x i16>, <8 x i16>* %B
135 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
139 define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
140 ;CHECK-LABEL: sqdmulh_2s:
142 %tmp1 = load <2 x i32>, <2 x i32>* %A
143 %tmp2 = load <2 x i32>, <2 x i32>* %B
144 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
148 define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
149 ;CHECK-LABEL: sqdmulh_4s:
151 %tmp1 = load <4 x i32>, <4 x i32>* %A
152 %tmp2 = load <4 x i32>, <4 x i32>* %B
153 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
157 define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind {
158 ;CHECK-LABEL: sqdmulh_1s:
159 ;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
160 %tmp1 = load i32, i32* %A
161 %tmp2 = load i32, i32* %B
162 %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
; Saturating doubling multiply-high (sqdmulh) intrinsic declarations,
; including the scalar i32 form used by the sqdmulh_1s test.
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
172 define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
173 ;CHECK-LABEL: sqrdmulh_4h:
175 %tmp1 = load <4 x i16>, <4 x i16>* %A
176 %tmp2 = load <4 x i16>, <4 x i16>* %B
177 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
181 define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
182 ;CHECK-LABEL: sqrdmulh_8h:
184 %tmp1 = load <8 x i16>, <8 x i16>* %A
185 %tmp2 = load <8 x i16>, <8 x i16>* %B
186 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
190 define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
191 ;CHECK-LABEL: sqrdmulh_2s:
193 %tmp1 = load <2 x i32>, <2 x i32>* %A
194 %tmp2 = load <2 x i32>, <2 x i32>* %B
195 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
199 define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
200 ;CHECK-LABEL: sqrdmulh_4s:
202 %tmp1 = load <4 x i32>, <4 x i32>* %A
203 %tmp2 = load <4 x i32>, <4 x i32>* %B
204 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
208 define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind {
209 ;CHECK-LABEL: sqrdmulh_1s:
210 ;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
211 %tmp1 = load i32, i32* %A
212 %tmp2 = load i32, i32* %B
213 %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
; Saturating rounding doubling multiply-high (sqrdmulh) intrinsic declarations,
; including the scalar i32 form used by the sqrdmulh_1s test.
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
223 define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
224 ;CHECK-LABEL: fmulx_2s:
226 %tmp1 = load <2 x float>, <2 x float>* %A
227 %tmp2 = load <2 x float>, <2 x float>* %B
228 %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
229 ret <2 x float> %tmp3
232 define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
233 ;CHECK-LABEL: fmulx_4s:
235 %tmp1 = load <4 x float>, <4 x float>* %A
236 %tmp2 = load <4 x float>, <4 x float>* %B
237 %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
238 ret <4 x float> %tmp3
241 define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
242 ;CHECK-LABEL: fmulx_2d:
244 %tmp1 = load <2 x double>, <2 x double>* %A
245 %tmp2 = load <2 x double>, <2 x double>* %B
246 %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
247 ret <2 x double> %tmp3
; Floating-point multiply extended (fmulx) intrinsic declarations.
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
254 define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
255 ;CHECK-LABEL: smlal4s:
257 %tmp1 = load <4 x i16>, <4 x i16>* %A
258 %tmp2 = load <4 x i16>, <4 x i16>* %B
259 %tmp3 = load <4 x i32>, <4 x i32>* %C
260 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
261 %tmp5 = add <4 x i32> %tmp3, %tmp4
265 define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
266 ;CHECK-LABEL: smlal2d:
268 %tmp1 = load <2 x i32>, <2 x i32>* %A
269 %tmp2 = load <2 x i32>, <2 x i32>* %B
270 %tmp3 = load <2 x i64>, <2 x i64>* %C
271 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
272 %tmp5 = add <2 x i64> %tmp3, %tmp4
276 define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
277 ;CHECK-LABEL: smlsl4s:
279 %tmp1 = load <4 x i16>, <4 x i16>* %A
280 %tmp2 = load <4 x i16>, <4 x i16>* %B
281 %tmp3 = load <4 x i32>, <4 x i32>* %C
282 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
283 %tmp5 = sub <4 x i32> %tmp3, %tmp4
287 define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
288 ;CHECK-LABEL: smlsl2d:
290 %tmp1 = load <2 x i32>, <2 x i32>* %A
291 %tmp2 = load <2 x i32>, <2 x i32>* %B
292 %tmp3 = load <2 x i64>, <2 x i64>* %C
293 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
294 %tmp5 = sub <2 x i64> %tmp3, %tmp4
; Saturating add/sub intrinsic declarations, used to form the sqdmlal/sqdmlsl
; (multiply-accumulate/-subtract) test patterns below.
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
303 define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
304 ;CHECK-LABEL: sqdmlal4s:
306 %tmp1 = load <4 x i16>, <4 x i16>* %A
307 %tmp2 = load <4 x i16>, <4 x i16>* %B
308 %tmp3 = load <4 x i32>, <4 x i32>* %C
309 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
310 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
314 define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
315 ;CHECK-LABEL: sqdmlal2d:
317 %tmp1 = load <2 x i32>, <2 x i32>* %A
318 %tmp2 = load <2 x i32>, <2 x i32>* %B
319 %tmp3 = load <2 x i64>, <2 x i64>* %C
320 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
321 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
325 define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
326 ;CHECK-LABEL: sqdmlal2_4s:
328 %load1 = load <8 x i16>, <8 x i16>* %A
329 %load2 = load <8 x i16>, <8 x i16>* %B
330 %tmp3 = load <4 x i32>, <4 x i32>* %C
331 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
332 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
333 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
334 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
338 define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
339 ;CHECK-LABEL: sqdmlal2_2d:
341 %load1 = load <4 x i32>, <4 x i32>* %A
342 %load2 = load <4 x i32>, <4 x i32>* %B
343 %tmp3 = load <2 x i64>, <2 x i64>* %C
344 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
345 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
346 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
347 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
351 define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
352 ;CHECK-LABEL: sqdmlsl4s:
354 %tmp1 = load <4 x i16>, <4 x i16>* %A
355 %tmp2 = load <4 x i16>, <4 x i16>* %B
356 %tmp3 = load <4 x i32>, <4 x i32>* %C
357 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
358 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
362 define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
363 ;CHECK-LABEL: sqdmlsl2d:
365 %tmp1 = load <2 x i32>, <2 x i32>* %A
366 %tmp2 = load <2 x i32>, <2 x i32>* %B
367 %tmp3 = load <2 x i64>, <2 x i64>* %C
368 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
369 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
373 define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
374 ;CHECK-LABEL: sqdmlsl2_4s:
376 %load1 = load <8 x i16>, <8 x i16>* %A
377 %load2 = load <8 x i16>, <8 x i16>* %B
378 %tmp3 = load <4 x i32>, <4 x i32>* %C
379 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
380 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
381 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
382 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
386 define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
387 ;CHECK-LABEL: sqdmlsl2_2d:
389 %load1 = load <4 x i32>, <4 x i32>* %A
390 %load2 = load <4 x i32>, <4 x i32>* %B
391 %tmp3 = load <2 x i64>, <2 x i64>* %C
392 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
393 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
394 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
395 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
399 define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
400 ;CHECK-LABEL: umlal4s:
402 %tmp1 = load <4 x i16>, <4 x i16>* %A
403 %tmp2 = load <4 x i16>, <4 x i16>* %B
404 %tmp3 = load <4 x i32>, <4 x i32>* %C
405 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
406 %tmp5 = add <4 x i32> %tmp3, %tmp4
410 define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
411 ;CHECK-LABEL: umlal2d:
413 %tmp1 = load <2 x i32>, <2 x i32>* %A
414 %tmp2 = load <2 x i32>, <2 x i32>* %B
415 %tmp3 = load <2 x i64>, <2 x i64>* %C
416 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
417 %tmp5 = add <2 x i64> %tmp3, %tmp4
421 define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
422 ;CHECK-LABEL: umlsl4s:
424 %tmp1 = load <4 x i16>, <4 x i16>* %A
425 %tmp2 = load <4 x i16>, <4 x i16>* %B
426 %tmp3 = load <4 x i32>, <4 x i32>* %C
427 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
428 %tmp5 = sub <4 x i32> %tmp3, %tmp4
432 define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
433 ;CHECK-LABEL: umlsl2d:
435 %tmp1 = load <2 x i32>, <2 x i32>* %A
436 %tmp2 = load <2 x i32>, <2 x i32>* %B
437 %tmp3 = load <2 x i64>, <2 x i64>* %C
438 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
439 %tmp5 = sub <2 x i64> %tmp3, %tmp4
443 define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
444 ;CHECK-LABEL: fmla_2s:
446 %tmp1 = load <2 x float>, <2 x float>* %A
447 %tmp2 = load <2 x float>, <2 x float>* %B
448 %tmp3 = load <2 x float>, <2 x float>* %C
449 %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
450 ret <2 x float> %tmp4
453 define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
454 ;CHECK-LABEL: fmla_4s:
456 %tmp1 = load <4 x float>, <4 x float>* %A
457 %tmp2 = load <4 x float>, <4 x float>* %B
458 %tmp3 = load <4 x float>, <4 x float>* %C
459 %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
460 ret <4 x float> %tmp4
463 define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
464 ;CHECK-LABEL: fmla_2d:
466 %tmp1 = load <2 x double>, <2 x double>* %A
467 %tmp2 = load <2 x double>, <2 x double>* %B
468 %tmp3 = load <2 x double>, <2 x double>* %C
469 %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
470 ret <2 x double> %tmp4
; Generic fused multiply-add intrinsic declarations, used by the fmla/fmls tests.
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
477 define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
478 ;CHECK-LABEL: fmls_2s:
480 %tmp1 = load <2 x float>, <2 x float>* %A
481 %tmp2 = load <2 x float>, <2 x float>* %B
482 %tmp3 = load <2 x float>, <2 x float>* %C
483 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
484 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3)
485 ret <2 x float> %tmp5
488 define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
489 ;CHECK-LABEL: fmls_4s:
491 %tmp1 = load <4 x float>, <4 x float>* %A
492 %tmp2 = load <4 x float>, <4 x float>* %B
493 %tmp3 = load <4 x float>, <4 x float>* %C
494 %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
495 %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3)
496 ret <4 x float> %tmp5
499 define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
500 ;CHECK-LABEL: fmls_2d:
502 %tmp1 = load <2 x double>, <2 x double>* %A
503 %tmp2 = load <2 x double>, <2 x double>* %B
504 %tmp3 = load <2 x double>, <2 x double>* %C
505 %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
506 %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3)
507 ret <2 x double> %tmp5
510 define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
511 ;CHECK-LABEL: fmls_commuted_neg_2s:
513 %tmp1 = load <2 x float>, <2 x float>* %A
514 %tmp2 = load <2 x float>, <2 x float>* %B
515 %tmp3 = load <2 x float>, <2 x float>* %C
516 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
517 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3)
518 ret <2 x float> %tmp5
521 define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
522 ;CHECK-LABEL: fmls_commuted_neg_4s:
524 %tmp1 = load <4 x float>, <4 x float>* %A
525 %tmp2 = load <4 x float>, <4 x float>* %B
526 %tmp3 = load <4 x float>, <4 x float>* %C
527 %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
528 %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3)
529 ret <4 x float> %tmp5
532 define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
533 ;CHECK-LABEL: fmls_commuted_neg_2d:
535 %tmp1 = load <2 x double>, <2 x double>* %A
536 %tmp2 = load <2 x double>, <2 x double>* %B
537 %tmp3 = load <2 x double>, <2 x double>* %C
538 %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
539 %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3)
540 ret <2 x double> %tmp5
543 define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
544 ;CHECK-LABEL: fmls_indexed_2s:
547 %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
548 %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
549 %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
550 ret <2 x float> %fmls1
553 define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
554 ;CHECK-LABEL: fmls_indexed_4s:
557 %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
558 %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
559 %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
560 ret <4 x float> %fmls1
563 define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
564 ;CHECK-LABEL: fmls_indexed_2d:
567 %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
568 %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
569 %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
570 ret <2 x double> %fmls1
573 define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
575 ; CHECK-LABEL: fmla_indexed_scalar_2s:
576 ; CHECK-NEXT: fmla.2s
578 %v1 = insertelement <2 x float> undef, float %c, i32 0
579 %v2 = insertelement <2 x float> %v1, float %c, i32 1
580 %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v1, <2 x float> %b, <2 x float> %a) nounwind
581 ret <2 x float> %fmla1
584 define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
586 ; CHECK-LABEL: fmla_indexed_scalar_4s:
587 ; CHECK-NEXT: fmla.4s
589 %v1 = insertelement <4 x float> undef, float %c, i32 0
590 %v2 = insertelement <4 x float> %v1, float %c, i32 1
591 %v3 = insertelement <4 x float> %v2, float %c, i32 2
592 %v4 = insertelement <4 x float> %v3, float %c, i32 3
593 %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
594 ret <4 x float> %fmla1
597 define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
598 ; CHECK-LABEL: fmla_indexed_scalar_2d:
599 ; CHECK-NEXT: fmla.2d
602 %v1 = insertelement <2 x double> undef, double %c, i32 0
603 %v2 = insertelement <2 x double> %v1, double %c, i32 1
604 %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
605 ret <2 x double> %fmla1
608 define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
609 ;CHECK-LABEL: mul_4h:
612 %tmp1 = load <4 x i16>, <4 x i16>* %A
613 %tmp2 = load <4 x i16>, <4 x i16>* %B
614 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
615 %tmp4 = mul <4 x i16> %tmp1, %tmp3
619 define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
620 ;CHECK-LABEL: mul_8h:
623 %tmp1 = load <8 x i16>, <8 x i16>* %A
624 %tmp2 = load <8 x i16>, <8 x i16>* %B
625 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
626 %tmp4 = mul <8 x i16> %tmp1, %tmp3
630 define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
631 ;CHECK-LABEL: mul_2s:
634 %tmp1 = load <2 x i32>, <2 x i32>* %A
635 %tmp2 = load <2 x i32>, <2 x i32>* %B
636 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
637 %tmp4 = mul <2 x i32> %tmp1, %tmp3
641 define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
642 ;CHECK-LABEL: mul_4s:
645 %tmp1 = load <4 x i32>, <4 x i32>* %A
646 %tmp2 = load <4 x i32>, <4 x i32>* %B
647 %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
648 %tmp4 = mul <4 x i32> %tmp1, %tmp3
652 define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
653 ; CHECK-LABEL: mul_2d:
656 %tmp1 = mul <2 x i64> %A, %B
660 define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
661 ;CHECK-LABEL: fmul_lane_2s:
664 %tmp1 = load <2 x float>, <2 x float>* %A
665 %tmp2 = load <2 x float>, <2 x float>* %B
666 %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
667 %tmp4 = fmul <2 x float> %tmp1, %tmp3
668 ret <2 x float> %tmp4
671 define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
672 ;CHECK-LABEL: fmul_lane_4s:
675 %tmp1 = load <4 x float>, <4 x float>* %A
676 %tmp2 = load <4 x float>, <4 x float>* %B
677 %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
678 %tmp4 = fmul <4 x float> %tmp1, %tmp3
679 ret <4 x float> %tmp4
682 define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
683 ;CHECK-LABEL: fmul_lane_2d:
686 %tmp1 = load <2 x double>, <2 x double>* %A
687 %tmp2 = load <2 x double>, <2 x double>* %B
688 %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
689 %tmp4 = fmul <2 x double> %tmp1, %tmp3
690 ret <2 x double> %tmp4
693 define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
694 ;CHECK-LABEL: fmul_lane_s:
696 ;CHECK: fmul.s s0, s0, v1[3]
697 %B = extractelement <4 x float> %vec, i32 3
698 %res = fmul float %A, %B
702 define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
703 ;CHECK-LABEL: fmul_lane_d:
705 ;CHECK: fmul.d d0, d0, v1[1]
706 %B = extractelement <2 x double> %vec, i32 1
707 %res = fmul double %A, %B
713 define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
714 ;CHECK-LABEL: fmulx_lane_2s:
717 %tmp1 = load <2 x float>, <2 x float>* %A
718 %tmp2 = load <2 x float>, <2 x float>* %B
719 %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
720 %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
721 ret <2 x float> %tmp4
724 define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
725 ;CHECK-LABEL: fmulx_lane_4s:
728 %tmp1 = load <4 x float>, <4 x float>* %A
729 %tmp2 = load <4 x float>, <4 x float>* %B
730 %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
731 %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
732 ret <4 x float> %tmp4
735 define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
736 ;CHECK-LABEL: fmulx_lane_2d:
739 %tmp1 = load <2 x double>, <2 x double>* %A
740 %tmp2 = load <2 x double>, <2 x double>* %B
741 %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
742 %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
743 ret <2 x double> %tmp4
746 define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
747 ;CHECK-LABEL: sqdmulh_lane_4h:
750 %tmp1 = load <4 x i16>, <4 x i16>* %A
751 %tmp2 = load <4 x i16>, <4 x i16>* %B
752 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
753 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
757 define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
758 ;CHECK-LABEL: sqdmulh_lane_8h:
761 %tmp1 = load <8 x i16>, <8 x i16>* %A
762 %tmp2 = load <8 x i16>, <8 x i16>* %B
763 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
764 %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
768 define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
769 ;CHECK-LABEL: sqdmulh_lane_2s:
772 %tmp1 = load <2 x i32>, <2 x i32>* %A
773 %tmp2 = load <2 x i32>, <2 x i32>* %B
774 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
775 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
779 define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
780 ;CHECK-LABEL: sqdmulh_lane_4s:
783 %tmp1 = load <4 x i32>, <4 x i32>* %A
784 %tmp2 = load <4 x i32>, <4 x i32>* %B
785 %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
786 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
790 define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
791 ;CHECK-LABEL: sqdmulh_lane_1s:
793 ;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
794 %tmp1 = extractelement <4 x i32> %B, i32 1
795 %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
799 define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
800 ;CHECK-LABEL: sqrdmulh_lane_4h:
803 %tmp1 = load <4 x i16>, <4 x i16>* %A
804 %tmp2 = load <4 x i16>, <4 x i16>* %B
805 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
806 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
810 define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
811 ;CHECK-LABEL: sqrdmulh_lane_8h:
814 %tmp1 = load <8 x i16>, <8 x i16>* %A
815 %tmp2 = load <8 x i16>, <8 x i16>* %B
816 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
817 %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
821 define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
822 ;CHECK-LABEL: sqrdmulh_lane_2s:
825 %tmp1 = load <2 x i32>, <2 x i32>* %A
826 %tmp2 = load <2 x i32>, <2 x i32>* %B
827 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
828 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
832 define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
833 ;CHECK-LABEL: sqrdmulh_lane_4s:
836 %tmp1 = load <4 x i32>, <4 x i32>* %A
837 %tmp2 = load <4 x i32>, <4 x i32>* %B
838 %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
839 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
843 define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
844 ;CHECK-LABEL: sqrdmulh_lane_1s:
846 ;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
847 %tmp1 = extractelement <4 x i32> %B, i32 1
848 %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
852 define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
853 ;CHECK-LABEL: sqdmull_lane_4s:
856 %tmp1 = load <4 x i16>, <4 x i16>* %A
857 %tmp2 = load <4 x i16>, <4 x i16>* %B
858 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
859 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
863 define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
864 ;CHECK-LABEL: sqdmull_lane_2d:
867 %tmp1 = load <2 x i32>, <2 x i32>* %A
868 %tmp2 = load <2 x i32>, <2 x i32>* %B
869 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
870 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
874 define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
875 ;CHECK-LABEL: sqdmull2_lane_4s:
878 %load1 = load <8 x i16>, <8 x i16>* %A
879 %load2 = load <8 x i16>, <8 x i16>* %B
880 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
881 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
882 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
886 define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
887 ;CHECK-LABEL: sqdmull2_lane_2d:
890 %load1 = load <4 x i32>, <4 x i32>* %A
891 %load2 = load <4 x i32>, <4 x i32>* %B
892 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
893 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
894 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
898 define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
899 ;CHECK-LABEL: umull_lane_4s:
902 %tmp1 = load <4 x i16>, <4 x i16>* %A
903 %tmp2 = load <4 x i16>, <4 x i16>* %B
904 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
905 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
909 define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
910 ;CHECK-LABEL: umull_lane_2d:
913 %tmp1 = load <2 x i32>, <2 x i32>* %A
914 %tmp2 = load <2 x i32>, <2 x i32>* %B
915 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
916 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
920 define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
921 ;CHECK-LABEL: smull_lane_4s:
924 %tmp1 = load <4 x i16>, <4 x i16>* %A
925 %tmp2 = load <4 x i16>, <4 x i16>* %B
926 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
927 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
931 define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
932 ;CHECK-LABEL: smull_lane_2d:
935 %tmp1 = load <2 x i32>, <2 x i32>* %A
936 %tmp2 = load <2 x i32>, <2 x i32>* %B
937 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
938 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
942 define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
943 ;CHECK-LABEL: smlal_lane_4s:
946 %tmp1 = load <4 x i16>, <4 x i16>* %A
947 %tmp2 = load <4 x i16>, <4 x i16>* %B
948 %tmp3 = load <4 x i32>, <4 x i32>* %C
949 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
950 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
951 %tmp6 = add <4 x i32> %tmp3, %tmp5
955 define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
956 ;CHECK-LABEL: smlal_lane_2d:
959 %tmp1 = load <2 x i32>, <2 x i32>* %A
960 %tmp2 = load <2 x i32>, <2 x i32>* %B
961 %tmp3 = load <2 x i64>, <2 x i64>* %C
962 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
963 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
964 %tmp6 = add <2 x i64> %tmp3, %tmp5
968 define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
969 ;CHECK-LABEL: sqdmlal_lane_4s:
972 %tmp1 = load <4 x i16>, <4 x i16>* %A
973 %tmp2 = load <4 x i16>, <4 x i16>* %B
974 %tmp3 = load <4 x i32>, <4 x i32>* %C
975 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
976 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
977 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
981 define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
982 ;CHECK-LABEL: sqdmlal_lane_2d:
985 %tmp1 = load <2 x i32>, <2 x i32>* %A
986 %tmp2 = load <2 x i32>, <2 x i32>* %B
987 %tmp3 = load <2 x i64>, <2 x i64>* %C
988 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
989 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
990 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
994 define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
995 ;CHECK-LABEL: sqdmlal2_lane_4s:
998 %load1 = load <8 x i16>, <8 x i16>* %A
999 %load2 = load <8 x i16>, <8 x i16>* %B
1000 %tmp3 = load <4 x i32>, <4 x i32>* %C
1001 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1002 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1003 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1004 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1008 define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
1009 ;CHECK-LABEL: sqdmlal2_lane_2d:
1012 %load1 = load <4 x i32>, <4 x i32>* %A
1013 %load2 = load <4 x i32>, <4 x i32>* %B
1014 %tmp3 = load <2 x i64>, <2 x i64>* %C
1015 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1016 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1017 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1018 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1022 define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1023 ;CHECK-LABEL: sqdmlal_lane_1s:
1025 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1026 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1027 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1028 %prod = extractelement <4 x i32> %prod.vec, i32 0
1029 %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
1032 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
1034 define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1035 ;CHECK-LABEL: sqdmlsl_lane_1s:
1037 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1038 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1039 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1040 %prod = extractelement <4 x i32> %prod.vec, i32 0
1041 %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
1044 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
1046 define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1047 ;CHECK-LABEL: sqdmlal_lane_1d:
1049 %rhs = extractelement <2 x i32> %C, i32 1
1050 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1051 %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
1054 declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
1055 declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
1057 define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1058 ;CHECK-LABEL: sqdmlsl_lane_1d:
1060 %rhs = extractelement <2 x i32> %C, i32 1
1061 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1062 %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
1065 declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
1068 define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1069 ;CHECK-LABEL: umlal_lane_4s:
1072 %tmp1 = load <4 x i16>, <4 x i16>* %A
1073 %tmp2 = load <4 x i16>, <4 x i16>* %B
1074 %tmp3 = load <4 x i32>, <4 x i32>* %C
1075 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1076 %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1077 %tmp6 = add <4 x i32> %tmp3, %tmp5
1081 define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1082 ;CHECK-LABEL: umlal_lane_2d:
1085 %tmp1 = load <2 x i32>, <2 x i32>* %A
1086 %tmp2 = load <2 x i32>, <2 x i32>* %B
1087 %tmp3 = load <2 x i64>, <2 x i64>* %C
1088 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1089 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1090 %tmp6 = add <2 x i64> %tmp3, %tmp5
1095 define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1096 ;CHECK-LABEL: smlsl_lane_4s:
1099 %tmp1 = load <4 x i16>, <4 x i16>* %A
1100 %tmp2 = load <4 x i16>, <4 x i16>* %B
1101 %tmp3 = load <4 x i32>, <4 x i32>* %C
1102 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1103 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1104 %tmp6 = sub <4 x i32> %tmp3, %tmp5
1108 define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1109 ;CHECK-LABEL: smlsl_lane_2d:
1112 %tmp1 = load <2 x i32>, <2 x i32>* %A
1113 %tmp2 = load <2 x i32>, <2 x i32>* %B
1114 %tmp3 = load <2 x i64>, <2 x i64>* %C
1115 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1116 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1117 %tmp6 = sub <2 x i64> %tmp3, %tmp5
1121 define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1122 ;CHECK-LABEL: sqdmlsl_lane_4s:
1125 %tmp1 = load <4 x i16>, <4 x i16>* %A
1126 %tmp2 = load <4 x i16>, <4 x i16>* %B
1127 %tmp3 = load <4 x i32>, <4 x i32>* %C
1128 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1129 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1130 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1134 define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1135 ;CHECK-LABEL: sqdmlsl_lane_2d:
1138 %tmp1 = load <2 x i32>, <2 x i32>* %A
1139 %tmp2 = load <2 x i32>, <2 x i32>* %B
1140 %tmp3 = load <2 x i64>, <2 x i64>* %C
1141 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1142 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1143 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1147 define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
1148 ;CHECK-LABEL: sqdmlsl2_lane_4s:
1151 %load1 = load <8 x i16>, <8 x i16>* %A
1152 %load2 = load <8 x i16>, <8 x i16>* %B
1153 %tmp3 = load <4 x i32>, <4 x i32>* %C
1154 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1155 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1156 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1157 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1161 define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
1162 ;CHECK-LABEL: sqdmlsl2_lane_2d:
1165 %load1 = load <4 x i32>, <4 x i32>* %A
1166 %load2 = load <4 x i32>, <4 x i32>* %B
1167 %tmp3 = load <2 x i64>, <2 x i64>* %C
1168 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1169 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1170 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1171 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1175 define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1176 ;CHECK-LABEL: umlsl_lane_4s:
1179 %tmp1 = load <4 x i16>, <4 x i16>* %A
1180 %tmp2 = load <4 x i16>, <4 x i16>* %B
1181 %tmp3 = load <4 x i32>, <4 x i32>* %C
1182 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1183 %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1184 %tmp6 = sub <4 x i32> %tmp3, %tmp5
1188 define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1189 ;CHECK-LABEL: umlsl_lane_2d:
1192 %tmp1 = load <2 x i32>, <2 x i32>* %A
1193 %tmp2 = load <2 x i32>, <2 x i32>* %B
1194 %tmp3 = load <2 x i64>, <2 x i64>* %C
1195 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1196 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1197 %tmp6 = sub <2 x i64> %tmp3, %tmp5
1202 define float @fmulxs(float %a, float %b) nounwind {
1203 ; CHECK-LABEL: fmulxs:
1204 ; CHECK-NEXT: fmulx s0, s0, s1
1205 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
1210 define double @fmulxd(double %a, double %b) nounwind {
1211 ; CHECK-LABEL: fmulxd:
1212 ; CHECK-NEXT: fmulx d0, d0, d1
1213 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
1218 define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
1219 ; CHECK-LABEL: fmulxs_lane:
1220 ; CHECK-NEXT: fmulx.s s0, s0, v1[3]
1221 %b = extractelement <4 x float> %vec, i32 3
1222 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
1227 define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
1228 ; CHECK-LABEL: fmulxd_lane:
1229 ; CHECK-NEXT: fmulx.d d0, d0, v1[1]
1230 %b = extractelement <2 x double> %vec, i32 1
1231 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
1236 declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
1237 declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
1240 define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
1241 ; CHECK-LABEL: smull2_8h_simple:
1242 ; CHECK-NEXT: smull2.8h v0, v0, v1
1244 %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1245 %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1246 %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
1250 define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
1251 ; CHECK-LABEL: foo0:
1252 ; CHECK: smull2.8h v0, v0, v1
1253 %tmp = bitcast <16 x i8> %a to <2 x i64>
1254 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1255 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1256 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1257 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1258 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1259 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1260 ret <8 x i16> %vmull.i.i
1263 define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
1264 ; CHECK-LABEL: foo1:
1265 ; CHECK: smull2.4s v0, v0, v1
1266 %tmp = bitcast <8 x i16> %a to <2 x i64>
1267 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1268 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1269 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1270 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1271 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1272 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1273 ret <4 x i32> %vmull2.i.i
1276 define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
1277 ; CHECK-LABEL: foo2:
1278 ; CHECK: smull2.2d v0, v0, v1
1279 %tmp = bitcast <4 x i32> %a to <2 x i64>
1280 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1281 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1282 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1283 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1284 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1285 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1286 ret <2 x i64> %vmull2.i.i
1289 define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
1290 ; CHECK-LABEL: foo3:
1291 ; CHECK: umull2.8h v0, v0, v1
1292 %tmp = bitcast <16 x i8> %a to <2 x i64>
1293 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1294 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1295 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1296 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1297 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1298 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1299 ret <8 x i16> %vmull.i.i
1302 define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
1303 ; CHECK-LABEL: foo4:
1304 ; CHECK: umull2.4s v0, v0, v1
1305 %tmp = bitcast <8 x i16> %a to <2 x i64>
1306 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1307 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1308 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1309 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1310 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1311 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1312 ret <4 x i32> %vmull2.i.i
1315 define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
1316 ; CHECK-LABEL: foo5:
1317 ; CHECK: umull2.2d v0, v0, v1
1318 %tmp = bitcast <4 x i32> %a to <2 x i64>
1319 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1320 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1321 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1322 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1323 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1324 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1325 ret <2 x i64> %vmull2.i.i
1328 define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1329 ; CHECK-LABEL: foo6:
1330 ; CHECK-NEXT: smull2.4s v0, v1, v2[1]
1333 %0 = bitcast <8 x i16> %b to <2 x i64>
1334 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1335 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1336 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1337 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1338 ret <4 x i32> %vmull2.i
1341 define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1342 ; CHECK-LABEL: foo6a:
1343 ; CHECK-NEXT: smull.4s v0, v1, v2[1]
1346 %0 = bitcast <8 x i16> %b to <2 x i64>
1347 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1348 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1349 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1350 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1351 ret <4 x i32> %vmull2.i
1354 define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1355 ; CHECK-LABEL: foo7:
1356 ; CHECK-NEXT: smull2.2d v0, v1, v2[1]
1359 %0 = bitcast <4 x i32> %b to <2 x i64>
1360 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1361 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1362 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1363 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1364 ret <2 x i64> %vmull2.i
1367 define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1368 ; CHECK-LABEL: foo7a:
1369 ; CHECK-NEXT: smull.2d v0, v1, v2[1]
1372 %0 = bitcast <4 x i32> %b to <2 x i64>
1373 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1374 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1375 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1376 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1377 ret <2 x i64> %vmull2.i
1381 define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1382 ; CHECK-LABEL: foo8:
1383 ; CHECK-NEXT: umull2.4s v0, v1, v2[1]
1386 %0 = bitcast <8 x i16> %b to <2 x i64>
1387 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1388 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1389 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1390 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1391 ret <4 x i32> %vmull2.i
1394 define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1395 ; CHECK-LABEL: foo8a:
1396 ; CHECK-NEXT: umull.4s v0, v1, v2[1]
1399 %0 = bitcast <8 x i16> %b to <2 x i64>
1400 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1401 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1402 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1403 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1404 ret <4 x i32> %vmull2.i
1407 define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1408 ; CHECK-LABEL: foo9:
1409 ; CHECK-NEXT: umull2.2d v0, v1, v2[1]
1412 %0 = bitcast <4 x i32> %b to <2 x i64>
1413 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1414 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1415 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1416 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1417 ret <2 x i64> %vmull2.i
1420 define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1421 ; CHECK-LABEL: foo9a:
1422 ; CHECK-NEXT: umull.2d v0, v1, v2[1]
1425 %0 = bitcast <4 x i32> %b to <2 x i64>
1426 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1427 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1428 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1429 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1430 ret <2 x i64> %vmull2.i
1433 define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
1434 ; CHECK-LABEL: bar0:
1435 ; CHECK: smlal2.8h v0, v1, v2
1438 %tmp = bitcast <16 x i8> %b to <2 x i64>
1439 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1440 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
1441 %tmp2 = bitcast <16 x i8> %c to <2 x i64>
1442 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1443 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
1444 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1445 %add.i = add <8 x i16> %vmull.i.i.i, %a
1446 ret <8 x i16> %add.i
1449 define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
1450 ; CHECK-LABEL: bar1:
1451 ; CHECK: smlal2.4s v0, v1, v2
1454 %tmp = bitcast <8 x i16> %b to <2 x i64>
1455 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1456 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
1457 %tmp2 = bitcast <8 x i16> %c to <2 x i64>
1458 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1459 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
1460 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1461 %add.i = add <4 x i32> %vmull2.i.i.i, %a
1462 ret <4 x i32> %add.i
1465 define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
1466 ; CHECK-LABEL: bar2:
1467 ; CHECK: smlal2.2d v0, v1, v2
1470 %tmp = bitcast <4 x i32> %b to <2 x i64>
1471 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1472 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
1473 %tmp2 = bitcast <4 x i32> %c to <2 x i64>
1474 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1475 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
1476 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1477 %add.i = add <2 x i64> %vmull2.i.i.i, %a
1478 ret <2 x i64> %add.i
1481 define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
1482 ; CHECK-LABEL: bar3:
1483 ; CHECK: umlal2.8h v0, v1, v2
1486 %tmp = bitcast <16 x i8> %b to <2 x i64>
1487 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1488 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
1489 %tmp2 = bitcast <16 x i8> %c to <2 x i64>
1490 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1491 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
1492 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1493 %add.i = add <8 x i16> %vmull.i.i.i, %a
1494 ret <8 x i16> %add.i
1497 define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
1498 ; CHECK-LABEL: bar4:
1499 ; CHECK: umlal2.4s v0, v1, v2
1502 %tmp = bitcast <8 x i16> %b to <2 x i64>
1503 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1504 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
1505 %tmp2 = bitcast <8 x i16> %c to <2 x i64>
1506 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1507 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
1508 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1509 %add.i = add <4 x i32> %vmull2.i.i.i, %a
1510 ret <4 x i32> %add.i
1513 define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
1514 ; CHECK-LABEL: bar5:
1515 ; CHECK: umlal2.2d v0, v1, v2
1518 %tmp = bitcast <4 x i32> %b to <2 x i64>
1519 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1520 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
1521 %tmp2 = bitcast <4 x i32> %c to <2 x i64>
1522 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1523 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
1524 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1525 %add.i = add <2 x i64> %vmull2.i.i.i, %a
1526 ret <2 x i64> %add.i
1529 define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
1530 ; CHECK-LABEL: mlal2_1:
1531 ; CHECK: smlal2.4s v0, v1, v2[3]
1533 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1534 %tmp = bitcast <8 x i16> %b to <2 x i64>
1535 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1536 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1537 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
1538 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1539 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1540 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1541 %add = add <4 x i32> %vmull2.i.i, %a
1545 define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
1546 ; CHECK-LABEL: mlal2_2:
1547 ; CHECK: smlal2.2d v0, v1, v2[1]
1549 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1550 %tmp = bitcast <4 x i32> %b to <2 x i64>
1551 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1552 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1553 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
1554 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1555 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1556 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1557 %add = add <2 x i64> %vmull2.i.i, %a
1561 define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
1562 ; CHECK-LABEL: mlal2_4:
1563 ; CHECK: umlal2.4s v0, v1, v2[2]
1566 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
1567 %tmp = bitcast <8 x i16> %b to <2 x i64>
1568 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1569 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1570 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
1571 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1572 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1573 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1574 %add = add <4 x i32> %vmull2.i.i, %a
1578 define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
1579 ; CHECK-LABEL: mlal2_5:
1580 ; CHECK: umlal2.2d v0, v1, v2[0]
1582 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
1583 %tmp = bitcast <4 x i32> %b to <2 x i64>
1584 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1585 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1586 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
1587 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1588 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1589 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1590 %add = add <2 x i64> %vmull2.i.i, %a
; fmul by a scalar splat should select the indexed fmul with lane 0.
define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
; CHECK-LABEL: vmulq_n_f64:
; CHECK: fmul.2d v0, v0, v1[0]
  %vecinit.i = insertelement <2 x double> undef, double %y, i32 0
  %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
  %mul.i = fmul <2 x double> %vecinit1.i, %x
  ret <2 x double> %mul.i
}

define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
; CHECK-LABEL: vmulq_n_f32:
; CHECK: fmul.4s v0, v0, v1[0]
  %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
  %mul.i = fmul <4 x float> %vecinit3.i, %x
  ret <4 x float> %mul.i
}

define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
; CHECK-LABEL: vmul_n_f32:
; CHECK: fmul.2s v0, v0, v1[0]
  %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
  %mul.i = fmul <2 x float> %vecinit1.i, %x
  ret <2 x float> %mul.i
}
; mla with a lane taken from a 128-bit vector: the splatted lane index must
; survive into the indexed mla (v2[6] here).
define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
; CHECK-LABEL: vmla_laneq_s16_test:
; CHECK: mla.4h v0, v1, v2[6]
  %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %mul = mul <4 x i16> %shuffle, %b
  %add = add <4 x i16> %mul, %a
  ret <4 x i16> %add
}

define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
; CHECK-LABEL: vmla_laneq_s32_test:
; CHECK: mla.2s v0, v1, v2[3]
  %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
  %mul = mul <2 x i32> %shuffle, %b
  %add = add <2 x i32> %mul, %a
  ret <2 x i32> %add
}

; A splat built from two chained shuffles: lane 1 of the extracted high half
; is lane 5 of the original vector, so v2[5] is expected.
define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
; CHECK-LABEL: not_really_vmlaq_laneq_s16_test:
; CHECK: mla.8h v0, v1, v2[5]
  %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %mul = mul <8 x i16> %shuffle2, %b
  %add = add <8 x i16> %mul, %a
  ret <8 x i16> %add
}

define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
; CHECK-LABEL: not_really_vmlaq_laneq_s32_test:
; CHECK: mla.4s v0, v1, v2[3]
  %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %mul = mul <4 x i32> %shuffle2, %b
  %add = add <4 x i32> %mul, %a
  ret <4 x i32> %add
}
; smull/umull with a lane splatted from a 128-bit ("laneq") operand should
; select the indexed widening multiply.
define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_s16_test:
; CHECK: smull.4s v0, v0, v1[6]
  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
  ret <4 x i32> %vmull2.i
}

define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_s32_test:
; CHECK: smull.2d v0, v0, v1[2]
  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
  ret <2 x i64> %vmull2.i
}

define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_u16_test:
; CHECK: umull.4s v0, v0, v1[6]
  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
  ret <4 x i32> %vmull2.i
}

define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_u32_test:
; CHECK: umull.2d v0, v0, v1[2]
  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
  ret <2 x i64> %vmull2.i
}
; The *_n tests build the scalar operand via insertelement splats and select
; the low/high half of the wide operand with a bitcast + <1 x i64> shuffle.
define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_low_n_s16_test:
  %conv = trunc i32 %d to i16
  %0 = bitcast <8 x i16> %b to <2 x i64>
  ; low 64 bits of %b
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
  ret <4 x i32> %vmull2.i.i
}

define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_high_n_s16_test:
  %conv = trunc i32 %d to i16
  %0 = bitcast <8 x i16> %b to <2 x i64>
  ; high 64 bits of %b
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
  ret <4 x i32> %vmull2.i.i
}

define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_high_n_s32_test:
  %0 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
  ret <2 x i64> %vmull2.i.i
}

define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_high_n_u16_test:
  %conv = trunc i32 %d to i16
  %0 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
  ret <4 x i32> %vmull2.i.i
}

define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_high_n_u32_test:
  %0 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
  ret <2 x i64> %vmull2.i.i
}
; A dup built manually from extractelement + insertelement chains should
; still be matched to the indexed mul form.
define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vmul_built_dup_test:
; CHECK: mul.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[1]
  %vget_lane = extractelement <4 x i32> %b, i32 1
  %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
  %prod = mul <4 x i32> %a, %vecinit3.i
  ret <4 x i32> %prod
}

define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmul_built_dup_fromsmall_test:
; CHECK: mul.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[3]
  %vget_lane = extractelement <4 x i16> %b, i32 3
  %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
  %prod = mul <4 x i16> %a, %vecinit3.i
  ret <4 x i16> %prod
}

; Widened dup: the lane source is a 64-bit vector but the mul is 128-bit.
define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
; CHECK: mul.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
  %vget_lane = extractelement <4 x i16> %b, i32 0
  %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
  %prod = mul <8 x i16> %a, %vecinit7.i
  ret <8 x i16> %prod
}
; sqdmull fed by two high-half extracts should use the "2" (high) variant.
define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mull_from_two_extracts:
; CHECK: sqdmull2.2d
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  ret <2 x i64> %res
}

define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mlal_from_two_extracts:
; CHECK: sqdmlal2.2d
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup_low:
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  ret <2 x i64> %res
}

define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup_high:
; CHECK: sqdmull2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  ret <2 x i64> %res
}
; Polynomial multiply (pmull) with dup'd or lane-splatted operands.
define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup_low:
  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
  ret <8 x i16> %res
}

define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup_high:
  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
  ret <8 x i16> %res
}

define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane_low:
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
  ret <8 x i16> %res
}

define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane_high:
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
  ret <8 x i16> %res
}
; Widening multiplies/accumulates where the RHS is a lane splat; the high
; variants should select the "2" instruction forms.
define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane_low:
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  ret <2 x i64> %res
}

define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane_high:
; CHECK: sqdmull2.2d
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  ret <2 x i64> %res
}

define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
; CHECK: sqdmlal2.2d
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_low:
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = add <2 x i64> %accum, %res
  ret <2 x i64> %sum
}

define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_high:
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = add <2 x i64> %accum, %res
  ret <2 x i64> %sum
}
; Scalar fused multiply-add/subtract where one operand is extracted from a
; vector lane: should select the indexed fmla.s/fmls.s (or .d) forms.
define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
; CHECK: fmla.s s0, s1, v2[3]
  %rhs = extractelement <4 x float> %rvec, i32 3
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
; CHECK: fmla.s s0, s1, v2[1]
  %rhs = extractelement <2 x float> %rvec, i32 1
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

; The fneg of the extracted lane should fold into fmls.
define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
; CHECK: fmls.s s0, s1, v2[3]
  %rhs.scal = extractelement <4 x float> %rvec, i32 3
  %rhs = fsub float -0.0, %rhs.scal
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
; CHECK: fmls.s s0, s1, v2[1]
  %rhs.scal = extractelement <2 x float> %rvec, i32 1
  %rhs = fsub float -0.0, %rhs.scal
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

declare float @llvm.fma.f32(float, float, float)

define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
; CHECK: fmla.d d0, d1, v2[1]
  %rhs = extractelement <2 x double> %rvec, i32 1
  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
  ret double %res
}

define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
; CHECK: fmls.d d0, d1, v2[1]
  %rhs.scal = extractelement <2 x double> %rvec, i32 1
  %rhs = fsub double -0.0, %rhs.scal
  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
  ret double %res
}

declare double @llvm.fma.f64(double, double, double)
; When the whole vector is negated before the lane splat, the negation should
; still fold into an indexed fmls.
define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
; CHECK: fmls.2s v0, v1, v2[3]
  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
  ret <2 x float> %res
}

define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
; CHECK: fmls.2s v0, v1, v2[1]
  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
  ret <2 x float> %res
}

define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
; CHECK: fmls.4s v0, v1, v2[3]
  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
  ret <4 x float> %res
}

define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
; CHECK: fmls.4s v0, v1, v2[1]
  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
  ret <4 x float> %res
}

define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
; CHECK: fmls.2d v0, v1, v2[1]
  %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
  %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
  ret <2 x double> %res
}
; <1 x T> and scalar saturating/polynomial multiply legalization tests.
define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fmul_v1f64:
  %prod = fmul <1 x double> %L, %R
  ret <1 x double> %prod
}

define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fdiv_v1f64:
  %prod = fdiv <1 x double> %L, %R
  ret <1 x double> %prod
}

define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlal_d:
  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
  ret i64 %tmp5
}

define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlsl_d:
  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
  ret i64 %tmp5
}

define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK-LABEL: test_pmull_64:
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
  ret <16 x i8> %val
}

; pmull64 of the high lanes of two vectors.
define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
; CHECK-LABEL: test_pmull_high_64:
  %l_hi = extractelement <2 x i64> %l, i32 1
  %r_hi = extractelement <2 x i64> %r, i32 1
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
  ret <16 x i8> %val
}

declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
2178 define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
2179 ; CHECK-LABEL: test_mul_v1i64:
2181 %prod = mul <1 x i64> %lhs, %rhs