test/CodeGen/AArch64/arm64-vadd.ll

   1 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
   2
   3 define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; addhn intrinsic on two <8 x i16> vectors loaded from %A and %B
   4 ;CHECK-LABEL: addhn8b:
   5 ;CHECK: addhn.8b
   6   %lhs = load <8 x i16>, <8 x i16>* %A
   7   %rhs = load <8 x i16>, <8 x i16>* %B
   8   %narrowed = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %lhs, <8 x i16> %rhs) ; result lanes are half the input width
   9   ret <8 x i8> %narrowed
  10 }
  11
  12 define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; addhn intrinsic on two <4 x i32> vectors loaded from %A and %B
  13 ;CHECK-LABEL: addhn4h:
  14 ;CHECK: addhn.4h
  15   %lhs = load <4 x i32>, <4 x i32>* %A
  16   %rhs = load <4 x i32>, <4 x i32>* %B
  17   %narrowed = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %lhs, <4 x i32> %rhs)
  18   ret <4 x i16> %narrowed
  19 }
  20
  21 define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind { ; addhn intrinsic on two <2 x i64> vectors loaded from %A and %B
  22 ;CHECK-LABEL: addhn2s:
  23 ;CHECK: addhn.2s
  24   %lhs = load <2 x i64>, <2 x i64>* %A
  25   %rhs = load <2 x i64>, <2 x i64>* %B
  26   %narrowed = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %lhs, <2 x i64> %rhs)
  27   ret <2 x i32> %narrowed
  28 }
  29
  30 define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { ; two addhn results concatenated into a single <16 x i8>
  31 ;CHECK-LABEL: addhn2_16b:
  32 ;CHECK: addhn.8b
  33 ;CHECK-NEXT: addhn2.16b
  34   %lo.half = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  35   %hi.half = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  36   %joined = shufflevector <8 x i8> %lo.half, <8 x i8> %hi.half, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; identity mask over both inputs
  37   ret <16 x i8> %joined
  38 }
  39
  40 define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { ; two addhn results concatenated into a single <8 x i16>
  41 ;CHECK-LABEL: addhn2_8h:
  42 ;CHECK: addhn.4h
  43 ;CHECK-NEXT: addhn2.8h
  44   %lo.half = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  45   %hi.half = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  46   %joined = shufflevector <4 x i16> %lo.half, <4 x i16> %hi.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask over both inputs
  47   ret <8 x i16> %joined
  48 }
  49
  50 define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { ; two addhn results concatenated into a single <4 x i32>
  51 ;CHECK-LABEL: addhn2_4s:
  52 ;CHECK: addhn.2s
  53 ;CHECK-NEXT: addhn2.4s
  54   %lo.half = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  55   %hi.half = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  56   %joined = shufflevector <2 x i32> %lo.half, <2 x i32> %hi.half, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask over both inputs
  57   ret <4 x i32> %joined
  58 }
  59
  60 declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone ; called by addhn2s and addhn2_4s
  61 declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone ; called by addhn4h and addhn2_8h
  62 declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone ; called by addhn8b and addhn2_16b
  63
  64
  65 define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; raddhn intrinsic on two <8 x i16> vectors loaded from %A and %B
  66 ;CHECK-LABEL: raddhn8b:
  67 ;CHECK: raddhn.8b
  68   %lhs = load <8 x i16>, <8 x i16>* %A
  69   %rhs = load <8 x i16>, <8 x i16>* %B
  70   %narrowed = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %lhs, <8 x i16> %rhs)
  71   ret <8 x i8> %narrowed
  72 }
  73
  74 define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; raddhn intrinsic on two <4 x i32> vectors loaded from %A and %B
  75 ;CHECK-LABEL: raddhn4h:
  76 ;CHECK: raddhn.4h
  77   %lhs = load <4 x i32>, <4 x i32>* %A
  78   %rhs = load <4 x i32>, <4 x i32>* %B
  79   %narrowed = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %lhs, <4 x i32> %rhs)
  80   ret <4 x i16> %narrowed
  81 }
  82
  83 define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind { ; raddhn intrinsic on two <2 x i64> vectors loaded from %A and %B
  84 ;CHECK-LABEL: raddhn2s:
  85 ;CHECK: raddhn.2s
  86   %lhs = load <2 x i64>, <2 x i64>* %A
  87   %rhs = load <2 x i64>, <2 x i64>* %B
  88   %narrowed = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %lhs, <2 x i64> %rhs)
  89   ret <2 x i32> %narrowed
  90 }
  91
  92 define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { ; two raddhn results concatenated into a single <16 x i8>
  93 ;CHECK-LABEL: raddhn2_16b:
  94 ;CHECK: raddhn.8b
  95 ;CHECK-NEXT: raddhn2.16b
  96   %lo.half = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  97   %hi.half = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  98   %joined = shufflevector <8 x i8> %lo.half, <8 x i8> %hi.half, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; identity mask over both inputs
  99   ret <16 x i8> %joined
 100 }
 101
 102 define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { ; two raddhn results concatenated into a single <8 x i16>
 103 ;CHECK-LABEL: raddhn2_8h:
 104 ;CHECK: raddhn.4h
 105 ;CHECK-NEXT: raddhn2.8h
 106   %lo.half = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
 107   %hi.half = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
 108   %joined = shufflevector <4 x i16> %lo.half, <4 x i16> %hi.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask over both inputs
 109   ret <8 x i16> %joined
 110 }
 111
 112 define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { ; two raddhn results concatenated into a single <4 x i32>
 113 ;CHECK-LABEL: raddhn2_4s:
 114 ;CHECK: raddhn.2s
 115 ;CHECK-NEXT: raddhn2.4s
 116   %lo.half = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
 117   %hi.half = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
 118   %joined = shufflevector <2 x i32> %lo.half, <2 x i32> %hi.half, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask over both inputs
 119   ret <4 x i32> %joined
 120 }
 121
 122 declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone ; called by raddhn2s and raddhn2_4s
 123 declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone ; called by raddhn4h and raddhn2_8h
 124 declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone ; called by raddhn8b and raddhn2_16b
 125
 126 define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; sext both loaded <8 x i8> vectors to i16 lanes, then add
 127 ;CHECK-LABEL: saddl8h:
 128 ;CHECK: saddl.8h
 129   %lhs = load <8 x i8>, <8 x i8>* %A
 130   %rhs = load <8 x i8>, <8 x i8>* %B
 131   %lhs.wide = sext <8 x i8> %lhs to <8 x i16>
 132   %rhs.wide = sext <8 x i8> %rhs to <8 x i16>
 133   %sum = add <8 x i16> %lhs.wide, %rhs.wide
 134   ret <8 x i16> %sum
 135 }
 136
 137 define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; sext both loaded <4 x i16> vectors to i32 lanes, then add
 138 ;CHECK-LABEL: saddl4s:
 139 ;CHECK: saddl.4s
 140   %lhs = load <4 x i16>, <4 x i16>* %A
 141   %rhs = load <4 x i16>, <4 x i16>* %B
 142   %lhs.wide = sext <4 x i16> %lhs to <4 x i32>
 143   %rhs.wide = sext <4 x i16> %rhs to <4 x i32>
 144   %sum = add <4 x i32> %lhs.wide, %rhs.wide
 145   ret <4 x i32> %sum
 146 }
 147
 148 define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; sext both loaded <2 x i32> vectors to i64 lanes, then add
 149 ;CHECK-LABEL: saddl2d:
 150 ;CHECK: saddl.2d
 151   %lhs = load <2 x i32>, <2 x i32>* %A
 152   %rhs = load <2 x i32>, <2 x i32>* %B
 153   %lhs.wide = sext <2 x i32> %lhs to <2 x i64>
 154   %rhs.wide = sext <2 x i32> %rhs to <2 x i64>
 155   %sum = add <2 x i64> %lhs.wide, %rhs.wide
 156   ret <2 x i64> %sum
 157 }
 158
 159 define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind { ; sext-add of element 1 of each operand's <2 x i64> view; expects one saddl2 then ret
 160 ; CHECK-LABEL: saddl2_8h:
 161 ; CHECK-NEXT: saddl2.8h v0, v0, v1
 162 ; CHECK-NEXT: ret
 163   %a.q = bitcast <16 x i8> %a to <2 x i64>
 164   %a.top = shufflevector <2 x i64> %a.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 165   %a.top.v = bitcast <1 x i64> %a.top to <8 x i8>
 166   %a.ext = sext <8 x i8> %a.top.v to <8 x i16>
 167   %b.q = bitcast <16 x i8> %b to <2 x i64>
 168   %b.top = shufflevector <2 x i64> %b.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 169   %b.top.v = bitcast <1 x i64> %b.top to <8 x i8>
 170   %b.ext = sext <8 x i8> %b.top.v to <8 x i16>
 171   %sum = add <8 x i16> %a.ext, %b.ext
 172   ret <8 x i16> %sum
 173 }
 174
 175 define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind { ; sext-add of element 1 of each operand's <2 x i64> view; expects one saddl2 then ret
 176 ; CHECK-LABEL: saddl2_4s:
 177 ; CHECK-NEXT: saddl2.4s v0, v0, v1
 178 ; CHECK-NEXT: ret
 179   %a.q = bitcast <8 x i16> %a to <2 x i64>
 180   %a.top = shufflevector <2 x i64> %a.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 181   %a.top.v = bitcast <1 x i64> %a.top to <4 x i16>
 182   %a.ext = sext <4 x i16> %a.top.v to <4 x i32>
 183   %b.q = bitcast <8 x i16> %b to <2 x i64>
 184   %b.top = shufflevector <2 x i64> %b.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 185   %b.top.v = bitcast <1 x i64> %b.top to <4 x i16>
 186   %b.ext = sext <4 x i16> %b.top.v to <4 x i32>
 187   %sum = add <4 x i32> %a.ext, %b.ext
 188   ret <4 x i32> %sum
 189 }
 190
 191 define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind { ; sext-add of element 1 of each operand's <2 x i64> view; expects one saddl2 then ret
 192 ; CHECK-LABEL: saddl2_2d:
 193 ; CHECK-NEXT: saddl2.2d v0, v0, v1
 194 ; CHECK-NEXT: ret
 195   %a.q = bitcast <4 x i32> %a to <2 x i64>
 196   %a.top = shufflevector <2 x i64> %a.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 197   %a.top.v = bitcast <1 x i64> %a.top to <2 x i32>
 198   %a.ext = sext <2 x i32> %a.top.v to <2 x i64>
 199   %b.q = bitcast <4 x i32> %b to <2 x i64>
 200   %b.top = shufflevector <2 x i64> %b.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 201   %b.top.v = bitcast <1 x i64> %b.top to <2 x i32>
 202   %b.ext = sext <2 x i32> %b.top.v to <2 x i64>
 203   %sum = add <2 x i64> %a.ext, %b.ext
 204   ret <2 x i64> %sum
 205 }
 206
 207 define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; zext both loaded <8 x i8> vectors to i16 lanes, then add
 208 ;CHECK-LABEL: uaddl8h:
 209 ;CHECK: uaddl.8h
 210   %lhs = load <8 x i8>, <8 x i8>* %A
 211   %rhs = load <8 x i8>, <8 x i8>* %B
 212   %lhs.wide = zext <8 x i8> %lhs to <8 x i16>
 213   %rhs.wide = zext <8 x i8> %rhs to <8 x i16>
 214   %sum = add <8 x i16> %lhs.wide, %rhs.wide
 215   ret <8 x i16> %sum
 216 }
 217
 218 define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; zext both loaded <4 x i16> vectors to i32 lanes, then add
 219 ;CHECK-LABEL: uaddl4s:
 220 ;CHECK: uaddl.4s
 221   %lhs = load <4 x i16>, <4 x i16>* %A
 222   %rhs = load <4 x i16>, <4 x i16>* %B
 223   %lhs.wide = zext <4 x i16> %lhs to <4 x i32>
 224   %rhs.wide = zext <4 x i16> %rhs to <4 x i32>
 225   %sum = add <4 x i32> %lhs.wide, %rhs.wide
 226   ret <4 x i32> %sum
 227 }
 228
 229 define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; zext both loaded <2 x i32> vectors to i64 lanes, then add
 230 ;CHECK-LABEL: uaddl2d:
 231 ;CHECK: uaddl.2d
 232   %lhs = load <2 x i32>, <2 x i32>* %A
 233   %rhs = load <2 x i32>, <2 x i32>* %B
 234   %lhs.wide = zext <2 x i32> %lhs to <2 x i64>
 235   %rhs.wide = zext <2 x i32> %rhs to <2 x i64>
 236   %sum = add <2 x i64> %lhs.wide, %rhs.wide
 237   ret <2 x i64> %sum
 238 }
 239
 240
 241 define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind { ; zext-add of element 1 of each operand's <2 x i64> view; expects one uaddl2 then ret
 242 ; CHECK-LABEL: uaddl2_8h:
 243 ; CHECK-NEXT: uaddl2.8h v0, v0, v1
 244 ; CHECK-NEXT: ret
 245   %a.q = bitcast <16 x i8> %a to <2 x i64>
 246   %a.top = shufflevector <2 x i64> %a.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 247   %a.top.v = bitcast <1 x i64> %a.top to <8 x i8>
 248   %a.ext = zext <8 x i8> %a.top.v to <8 x i16>
 249   %b.q = bitcast <16 x i8> %b to <2 x i64>
 250   %b.top = shufflevector <2 x i64> %b.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 251   %b.top.v = bitcast <1 x i64> %b.top to <8 x i8>
 252   %b.ext = zext <8 x i8> %b.top.v to <8 x i16>
 253   %sum = add <8 x i16> %a.ext, %b.ext
 254   ret <8 x i16> %sum
 255 }
 256
 257 define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind { ; zext-add of element 1 of each operand's <2 x i64> view; expects one uaddl2 then ret
 258 ; CHECK-LABEL: uaddl2_4s:
 259 ; CHECK-NEXT: uaddl2.4s v0, v0, v1
 260 ; CHECK-NEXT: ret
 261   %a.q = bitcast <8 x i16> %a to <2 x i64>
 262   %a.top = shufflevector <2 x i64> %a.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 263   %a.top.v = bitcast <1 x i64> %a.top to <4 x i16>
 264   %a.ext = zext <4 x i16> %a.top.v to <4 x i32>
 265   %b.q = bitcast <8 x i16> %b to <2 x i64>
 266   %b.top = shufflevector <2 x i64> %b.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 267   %b.top.v = bitcast <1 x i64> %b.top to <4 x i16>
 268   %b.ext = zext <4 x i16> %b.top.v to <4 x i32>
 269   %sum = add <4 x i32> %a.ext, %b.ext
 270   ret <4 x i32> %sum
 271 }
 272
 273 define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind { ; zext-add of element 1 of each operand's <2 x i64> view; expects one uaddl2 then ret
 274 ; CHECK-LABEL: uaddl2_2d:
 275 ; CHECK-NEXT: uaddl2.2d v0, v0, v1
 276 ; CHECK-NEXT: ret
 277   %a.q = bitcast <4 x i32> %a to <2 x i64>
 278   %a.top = shufflevector <2 x i64> %a.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 279   %a.top.v = bitcast <1 x i64> %a.top to <2 x i32>
 280   %a.ext = zext <2 x i32> %a.top.v to <2 x i64>
 281   %b.q = bitcast <4 x i32> %b to <2 x i64>
 282   %b.top = shufflevector <2 x i64> %b.q, <2 x i64> undef, <1 x i32> <i32 1> ; keep only element 1
 283   %b.top.v = bitcast <1 x i64> %b.top to <2 x i32>
 284   %b.ext = zext <2 x i32> %b.top.v to <2 x i64>
 285   %sum = add <2 x i64> %a.ext, %b.ext
 286   ret <2 x i64> %sum
 287 }
 288
 289 define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind { ; wide vector from %A plus the zext of the narrow vector from %B
 290 ;CHECK-LABEL: uaddw8h:
 291 ;CHECK: uaddw.8h
 292   %wide = load <8 x i16>, <8 x i16>* %A
 293   %narrow = load <8 x i8>, <8 x i8>* %B
 294   %narrow.ext = zext <8 x i8> %narrow to <8 x i16>
 295   %sum = add <8 x i16> %wide, %narrow.ext
 296   ret <8 x i16> %sum
 297 }
 298
 299 define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind { ; wide vector from %A plus the zext of the narrow vector from %B
 300 ;CHECK-LABEL: uaddw4s:
 301 ;CHECK: uaddw.4s
 302   %wide = load <4 x i32>, <4 x i32>* %A
 303   %narrow = load <4 x i16>, <4 x i16>* %B
 304   %narrow.ext = zext <4 x i16> %narrow to <4 x i32>
 305   %sum = add <4 x i32> %wide, %narrow.ext
 306   ret <4 x i32> %sum
 307 }
 308
 309 define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { ; wide vector from %A plus the zext of the narrow vector from %B
 310 ;CHECK-LABEL: uaddw2d:
 311 ;CHECK: uaddw.2d
 312   %wide = load <2 x i64>, <2 x i64>* %A
 313   %narrow = load <2 x i32>, <2 x i32>* %B
 314   %narrow.ext = zext <2 x i32> %narrow to <2 x i64>
 315   %sum = add <2 x i64> %wide, %narrow.ext
 316   ret <2 x i64> %sum
 317 }
 318
 319 define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ; vector from %A plus the zext of lanes 8-15 of the vector from %B
 320 ;CHECK-LABEL: uaddw2_8h:
 321 ;CHECK: uaddw.8h
 322   %wide = load <8 x i16>, <8 x i16>* %A ; NOTE(review): the expected mnemonic above is the non-"2" form although the upper lanes are selected - presumably intentional, confirm
 323
 324   %full = load <16 x i8>, <16 x i8>* %B
 325   %upper = shufflevector <16 x i8> %full, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 326   %upper.ext = zext <8 x i8> %upper to <8 x i16>
 327
 328   %sum = add <8 x i16> %wide, %upper.ext
 329   ret <8 x i16> %sum
 330 }
 331
 332 define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ; vector from %A plus the zext of lanes 4-7 of the vector from %B
 333 ;CHECK-LABEL: uaddw2_4s:
 334 ;CHECK: uaddw.4s
 335   %wide = load <4 x i32>, <4 x i32>* %A ; NOTE(review): the expected mnemonic above is the non-"2" form although the upper lanes are selected - presumably intentional, confirm
 336
 337   %full = load <8 x i16>, <8 x i16>* %B
 338   %upper = shufflevector <8 x i16> %full, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 339   %upper.ext = zext <4 x i16> %upper to <4 x i32>
 340
 341   %sum = add <4 x i32> %wide, %upper.ext
 342   ret <4 x i32> %sum
 343 }
 344
 345 define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ; vector from %A plus the zext of lanes 2-3 of the vector from %B
 346 ;CHECK-LABEL: uaddw2_2d:
 347 ;CHECK: uaddw.2d
 348   %wide = load <2 x i64>, <2 x i64>* %A ; NOTE(review): the expected mnemonic above is the non-"2" form although the upper lanes are selected - presumably intentional, confirm
 349
 350   %full = load <4 x i32>, <4 x i32>* %B
 351   %upper = shufflevector <4 x i32> %full, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 352   %upper.ext = zext <2 x i32> %upper to <2 x i64>
 353
 354   %sum = add <2 x i64> %wide, %upper.ext
 355   ret <2 x i64> %sum
 356 }
 357
 358 define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind { ; wide vector from %A plus the sext of the narrow vector from %B
 359 ;CHECK-LABEL: saddw8h:
 360 ;CHECK: saddw.8h
 361   %wide = load <8 x i16>, <8 x i16>* %A
 362   %narrow = load <8 x i8>, <8 x i8>* %B
 363   %narrow.ext = sext <8 x i8> %narrow to <8 x i16>
 364   %sum = add <8 x i16> %wide, %narrow.ext
 365   ret <8 x i16> %sum
 366 }
 367
 368 define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind { ; wide vector from %A plus the sext of the narrow vector from %B
 369 ;CHECK-LABEL: saddw4s:
 370 ;CHECK: saddw.4s
 371   %wide = load <4 x i32>, <4 x i32>* %A
 372   %narrow = load <4 x i16>, <4 x i16>* %B
 373   %narrow.ext = sext <4 x i16> %narrow to <4 x i32>
 374   %sum = add <4 x i32> %wide, %narrow.ext
 375   ret <4 x i32> %sum
 376 }
 377
 378 define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { ; wide vector from %A plus the sext of the narrow vector from %B
 379 ;CHECK-LABEL: saddw2d:
 380 ;CHECK: saddw.2d
 381   %wide = load <2 x i64>, <2 x i64>* %A
 382   %narrow = load <2 x i32>, <2 x i32>* %B
 383   %narrow.ext = sext <2 x i32> %narrow to <2 x i64>
 384   %sum = add <2 x i64> %wide, %narrow.ext
 385   ret <2 x i64> %sum
 386 }
 387
 388 define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ; vector from %A plus the sext of lanes 8-15 of the vector from %B
 389 ;CHECK-LABEL: saddw2_8h:
 390 ;CHECK: saddw.8h
 391   %wide = load <8 x i16>, <8 x i16>* %A ; NOTE(review): the expected mnemonic above is the non-"2" form although the upper lanes are selected - presumably intentional, confirm
 392
 393   %full = load <16 x i8>, <16 x i8>* %B
 394   %upper = shufflevector <16 x i8> %full, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 395   %upper.ext = sext <8 x i8> %upper to <8 x i16>
 396
 397   %sum = add <8 x i16> %wide, %upper.ext
 398   ret <8 x i16> %sum
 399 }
 400
 401 define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ; vector from %A plus the sext of lanes 4-7 of the vector from %B
 402 ;CHECK-LABEL: saddw2_4s:
 403 ;CHECK: saddw.4s
 404   %wide = load <4 x i32>, <4 x i32>* %A ; NOTE(review): the expected mnemonic above is the non-"2" form although the upper lanes are selected - presumably intentional, confirm
 405
 406   %full = load <8 x i16>, <8 x i16>* %B
 407   %upper = shufflevector <8 x i16> %full, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 408   %upper.ext = sext <4 x i16> %upper to <4 x i32>
 409
 410   %sum = add <4 x i32> %wide, %upper.ext
 411   ret <4 x i32> %sum
 412 }
 413
 414 define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ; vector from %A plus the sext of lanes 2-3 of the vector from %B
 415 ;CHECK-LABEL: saddw2_2d:
 416 ;CHECK: saddw.2d
 417   %wide = load <2 x i64>, <2 x i64>* %A ; NOTE(review): the expected mnemonic above is the non-"2" form although the upper lanes are selected - presumably intentional, confirm
 418
 419   %full = load <4 x i32>, <4 x i32>* %B
 420   %upper = shufflevector <4 x i32> %full, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 421   %upper.ext = sext <2 x i32> %upper to <2 x i64>
 422
 423   %sum = add <2 x i64> %wide, %upper.ext
 424   ret <2 x i64> %sum
 425 }
 426
 427 define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind { ; saddlp intrinsic on a <8 x i8> vector loaded from %A
 428 ;CHECK-LABEL: saddlp4h:
 429 ;CHECK: saddlp.4h
 430   %src = load <8 x i8>, <8 x i8>* %A
 431   %res = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %src)
 432   ret <4 x i16> %res
 433 }
 434
 435 define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind { ; saddlp intrinsic on a <4 x i16> vector loaded from %A
 436 ;CHECK-LABEL: saddlp2s:
 437 ;CHECK: saddlp.2s
 438   %src = load <4 x i16>, <4 x i16>* %A
 439   %res = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %src)
 440   ret <2 x i32> %res
 441 }
 442
 443 define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind { ; saddlp intrinsic on a <2 x i32> vector loaded from %A
 444 ;CHECK-LABEL: saddlp1d:
 445 ;CHECK: saddlp.1d
 446   %src = load <2 x i32>, <2 x i32>* %A
 447   %res = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %src)
 448   ret <1 x i64> %res
 449 }
 450
 451 define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind { ; saddlp intrinsic on a <16 x i8> vector loaded from %A
 452 ;CHECK-LABEL: saddlp8h:
 453 ;CHECK: saddlp.8h
 454   %src = load <16 x i8>, <16 x i8>* %A
 455   %res = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %src)
 456   ret <8 x i16> %res
 457 }
 458
 459 define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind { ; saddlp intrinsic on a <8 x i16> vector loaded from %A
 460 ;CHECK-LABEL: saddlp4s:
 461 ;CHECK: saddlp.4s
 462   %src = load <8 x i16>, <8 x i16>* %A
 463   %res = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %src)
 464   ret <4 x i32> %res
 465 }
 466
 467 define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind { ; saddlp intrinsic on a <4 x i32> vector loaded from %A
 468 ;CHECK-LABEL: saddlp2d:
 469 ;CHECK: saddlp.2d
 470   %src = load <4 x i32>, <4 x i32>* %A
 471   %res = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %src)
 472   ret <2 x i64> %res
 473 }
 474
 475 declare <4 x i16>  @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone ; called by saddlp4h and sadalp4h
 476 declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone ; called by saddlp2s and sadalp2s
 477 declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone ; called by saddlp1d
 478
 479 declare <8 x i16>  @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone ; called by saddlp8h and sadalp8h
 480 declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone ; called by saddlp4s and sadalp4s
 481 declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone ; called by saddlp2d and sadalp2d
 482
 483 define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind { ; uaddlp intrinsic on a <8 x i8> vector loaded from %A
 484 ;CHECK-LABEL: uaddlp4h:
 485 ;CHECK: uaddlp.4h
 486   %src = load <8 x i8>, <8 x i8>* %A
 487   %res = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %src)
 488   ret <4 x i16> %res
 489 }
 490
 491 define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind { ; uaddlp intrinsic on a <4 x i16> vector loaded from %A
 492 ;CHECK-LABEL: uaddlp2s:
 493 ;CHECK: uaddlp.2s
 494   %src = load <4 x i16>, <4 x i16>* %A
 495   %res = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %src)
 496   ret <2 x i32> %res
 497 }
 498
 499 define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind { ; uaddlp intrinsic on a <2 x i32> vector loaded from %A
 500 ;CHECK-LABEL: uaddlp1d:
 501 ;CHECK: uaddlp.1d
 502   %src = load <2 x i32>, <2 x i32>* %A
 503   %res = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %src)
 504   ret <1 x i64> %res
 505 }
 506
 507 define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind { ; uaddlp intrinsic on a <16 x i8> vector loaded from %A
 508 ;CHECK-LABEL: uaddlp8h:
 509 ;CHECK: uaddlp.8h
 510   %src = load <16 x i8>, <16 x i8>* %A
 511   %res = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %src)
 512   ret <8 x i16> %res
 513 }
 514
 515 define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind { ; uaddlp intrinsic on a <8 x i16> vector loaded from %A
 516 ;CHECK-LABEL: uaddlp4s:
 517 ;CHECK: uaddlp.4s
 518   %src = load <8 x i16>, <8 x i16>* %A
 519   %res = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %src)
 520   ret <4 x i32> %res
 521 }
 522
 523 define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind { ; uaddlp intrinsic on a <4 x i32> vector loaded from %A
 524 ;CHECK-LABEL: uaddlp2d:
 525 ;CHECK: uaddlp.2d
 526   %src = load <4 x i32>, <4 x i32>* %A
 527   %res = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %src)
 528   ret <2 x i64> %res
 529 }
 530
 531 declare <4 x i16>  @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone ; called by uaddlp4h and uadalp4h
 532 declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone ; called by uaddlp2s and uadalp2s
 533 declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone ; called by uaddlp1d
 534
 535 declare <8 x i16>  @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone ; called by uaddlp8h and uadalp8h
 536 declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone ; called by uaddlp4s and uadalp4s
 537 declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone ; called by uaddlp2d and uadalp2d
 538
 539 define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind { ; saddlp of the vector at %A added to the vector at %B; expects the single sadalp form
 540 ;CHECK-LABEL: sadalp4h:
 541 ;CHECK: sadalp.4h
 542   %src = load <8 x i8>, <8 x i8>* %A
 543   %pairs = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %src)
 544   %acc = load <4 x i16>, <4 x i16>* %B
 545   %sum = add <4 x i16> %pairs, %acc
 546   ret <4 x i16> %sum
 547 }
 548
 549 define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind { ; saddlp of the vector at %A added to the vector at %B; expects the single sadalp form
 550 ;CHECK-LABEL: sadalp2s:
 551 ;CHECK: sadalp.2s
 552   %src = load <4 x i16>, <4 x i16>* %A
 553   %pairs = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %src)
 554   %acc = load <2 x i32>, <2 x i32>* %B
 555   %sum = add <2 x i32> %pairs, %acc
 556   ret <2 x i32> %sum
 557 }
 558
 559 define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind { ; saddlp of the vector at %A added to the vector at %B; expects the single sadalp form
 560 ;CHECK-LABEL: sadalp8h:
 561 ;CHECK: sadalp.8h
 562   %src = load <16 x i8>, <16 x i8>* %A
 563   %pairs = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %src)
 564   %acc = load <8 x i16>, <8 x i16>* %B
 565   %sum = add <8 x i16> %pairs, %acc
 566   ret <8 x i16> %sum
 567 }
 568
 569 define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind { ; saddlp of the vector at %A added to the vector at %B; expects the single sadalp form
 570 ;CHECK-LABEL: sadalp4s:
 571 ;CHECK: sadalp.4s
 572   %src = load <8 x i16>, <8 x i16>* %A
 573   %pairs = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %src)
 574   %acc = load <4 x i32>, <4 x i32>* %B
 575   %sum = add <4 x i32> %pairs, %acc
 576   ret <4 x i32> %sum
 577 }
 578
 579 define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind { ; saddlp of the vector at %A added to the vector at %B; expects the single sadalp form
 580 ;CHECK-LABEL: sadalp2d:
 581 ;CHECK: sadalp.2d
 582   %src = load <4 x i32>, <4 x i32>* %A
 583   %pairs = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %src)
 584   %acc = load <2 x i64>, <2 x i64>* %B
 585   %sum = add <2 x i64> %pairs, %acc
 586   ret <2 x i64> %sum
 587 }
 588
 589 define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind { ; uaddlp of the vector at %A added to the vector at %B; expects the single uadalp form
 590 ;CHECK-LABEL: uadalp4h:
 591 ;CHECK: uadalp.4h
 592   %src = load <8 x i8>, <8 x i8>* %A
 593   %pairs = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %src)
 594   %acc = load <4 x i16>, <4 x i16>* %B
 595   %sum = add <4 x i16> %pairs, %acc
 596   ret <4 x i16> %sum
 597 }
 598
 599 define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind { ; uaddlp of the vector at %A added to the vector at %B; expects the single uadalp form
 600 ;CHECK-LABEL: uadalp2s:
 601 ;CHECK: uadalp.2s
 602   %src = load <4 x i16>, <4 x i16>* %A
 603   %pairs = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %src)
 604   %acc = load <2 x i32>, <2 x i32>* %B
 605   %sum = add <2 x i32> %pairs, %acc
 606   ret <2 x i32> %sum
 607 }
 608
 609 define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind { ; uaddlp of the vector at %A added to the vector at %B; expects the single uadalp form
 610 ;CHECK-LABEL: uadalp8h:
 611 ;CHECK: uadalp.8h
 612   %src = load <16 x i8>, <16 x i8>* %A
 613   %pairs = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %src)
 614   %acc = load <8 x i16>, <8 x i16>* %B
 615   %sum = add <8 x i16> %pairs, %acc
 616   ret <8 x i16> %sum
 617 }
 618
 619 define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind { ; uaddlp of the vector at %A added to the vector at %B; expects the single uadalp form
 620 ;CHECK-LABEL: uadalp4s:
 621 ;CHECK: uadalp.4s
 622   %src = load <8 x i16>, <8 x i16>* %A
 623   %pairs = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %src)
 624   %acc = load <4 x i32>, <4 x i32>* %B
 625   %sum = add <4 x i32> %pairs, %acc
 626   ret <4 x i32> %sum
 627 }
 628
 629 define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind { ; uaddlp of the vector at %A added to the vector at %B; expects the single uadalp form
 630 ;CHECK-LABEL: uadalp2d:
 631 ;CHECK: uadalp.2d
 632   %src = load <4 x i32>, <4 x i32>* %A
 633   %pairs = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %src)
 634   %acc = load <2 x i64>, <2 x i64>* %B
 635   %sum = add <2 x i64> %pairs, %acc
 636   ret <2 x i64> %sum
 637 }
 638
 639 define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ; addp intrinsic on two <8 x i8> vectors loaded from %A and %B
 640 ;CHECK-LABEL: addp_8b:
 641 ;CHECK: addp.8b
 642   %lhs = load <8 x i8>, <8 x i8>* %A
 643   %rhs = load <8 x i8>, <8 x i8>* %B
 644   %res = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
 645   ret <8 x i8> %res
 646 }
 647
 648 define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ; addp intrinsic on two <16 x i8> vectors loaded from %A and %B
 649 ;CHECK-LABEL: addp_16b:
 650 ;CHECK: addp.16b
 651   %lhs = load <16 x i8>, <16 x i8>* %A
 652   %rhs = load <16 x i8>, <16 x i8>* %B
 653   %res = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
 654   ret <16 x i8> %res
 655 }
 656
 657 define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; addp intrinsic on two <4 x i16> vectors loaded from %A and %B
 658 ;CHECK-LABEL: addp_4h:
 659 ;CHECK: addp.4h
 660   %lhs = load <4 x i16>, <4 x i16>* %A
 661   %rhs = load <4 x i16>, <4 x i16>* %B
 662   %res = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
 663   ret <4 x i16> %res
 664 }
 665
 666 define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; addp intrinsic on two <8 x i16> vectors loaded from %A and %B
 667 ;CHECK-LABEL: addp_8h:
 668 ;CHECK: addp.8h
 669   %lhs = load <8 x i16>, <8 x i16>* %A
 670   %rhs = load <8 x i16>, <8 x i16>* %B
 671   %res = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
 672   ret <8 x i16> %res
 673 }
 674
 675 define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; addp intrinsic on two <2 x i32> vectors loaded from %A and %B
 676 ;CHECK-LABEL: addp_2s:
 677 ;CHECK: addp.2s
 678   %lhs = load <2 x i32>, <2 x i32>* %A
 679   %rhs = load <2 x i32>, <2 x i32>* %B
 680   %res = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
 681   ret <2 x i32> %res
 682 }
 683
 684 define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; addp intrinsic on two <4 x i32> vectors loaded from %A and %B
 685 ;CHECK-LABEL: addp_4s:
 686 ;CHECK: addp.4s
 687   %lhs = load <4 x i32>, <4 x i32>* %A
 688   %rhs = load <4 x i32>, <4 x i32>* %B
 689   %res = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
 690   ret <4 x i32> %res
 691 }
 692
 693 define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ; addp intrinsic on two <2 x i64> vectors loaded from %A and %B
 694 ;CHECK-LABEL: addp_2d:
 695 ;CHECK: addp.2d
 696   %lhs = load <2 x i64>, <2 x i64>* %A
 697   %rhs = load <2 x i64>, <2 x i64>* %B
 698   %res = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
 699   ret <2 x i64> %res
 700 }
 701
 702 declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone ; called by addp_8b
 703 declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone ; called by addp_16b
 704 declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone ; called by addp_4h
 705 declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone ; called by addp_8h
 706 declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone ; called by addp_2s
 707 declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone ; called by addp_4s
 708 declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone ; called by addp_2d
 709
 710 define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ; faddp intrinsic on two <2 x float> vectors loaded from %A and %B
 711 ;CHECK-LABEL: faddp_2s:
 712 ;CHECK: faddp.2s
 713   %lhs = load <2 x float>, <2 x float>* %A
 714   %rhs = load <2 x float>, <2 x float>* %B
 715   %res = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
 716   ret <2 x float> %res
 717 }
 718
 719 define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ; faddp intrinsic on two <4 x float> vectors loaded from %A and %B
 720 ;CHECK-LABEL: faddp_4s:
 721 ;CHECK: faddp.4s
 722   %lhs = load <4 x float>, <4 x float>* %A
 723   %rhs = load <4 x float>, <4 x float>* %B
 724   %res = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
 725   ret <4 x float> %res
 726 }
 727
 728 define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ; faddp intrinsic on two <2 x double> vectors loaded from %A and %B
 729 ;CHECK-LABEL: faddp_2d:
 730 ;CHECK: faddp.2d
 731   %lhs = load <2 x double>, <2 x double>* %A
 732   %rhs = load <2 x double>, <2 x double>* %B
 733   %res = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
 734   ret <2 x double> %res
 735 }
 736
 737 declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) nounwind readnone ; called by faddp_2s
 738 declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nounwind readnone ; called by faddp_4s
 739 declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone ; called by faddp_2d
 740
 741 define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
     ; Zero-extending add of the low half of %lhs (lanes 0-1) with a two-lane
     ; splat of %rhs. The directives expect uaddl.2d with no ext.16b before it.
 742 ; CHECK-LABEL: uaddl_duprhs:
 743 ; CHECK-NOT: ext.16b
 744 ; CHECK: uaddl.2d
 745   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 746   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 747
 748   %lhs.low = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
 749
 750   %lhs.ext = zext <2 x i32> %lhs.low to <2 x i64>
 751   %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
 752
 753   %res = add <2 x i64> %lhs.ext, %rhs.ext
 754   ret <2 x i64> %res
 755 }
 756
 757 define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
     ; Zero-extending add of the high half of %lhs (lanes 2-3) with a two-lane
     ; splat of %rhs. The directives expect uaddl2.2d with no ext.16b before it.
 758 ; CHECK-LABEL: uaddl2_duprhs:
 759 ; CHECK-NOT: ext.16b
 760 ; CHECK: uaddl2.2d
 761   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 762   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 763
 764   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 765
 766   %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
 767   %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
 768
 769   %res = add <2 x i64> %lhs.ext, %rhs.ext
 770   ret <2 x i64> %res
 771 }
 772
 773 define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
     ; Sign-extending add of a two-lane splat of %lhs with the low half of %rhs
     ; (lanes 0-1). The directives expect saddl.2d with no ext.16b before it.
 774 ; CHECK-LABEL: saddl_duplhs:
 775 ; CHECK-NOT: ext.16b
 776 ; CHECK: saddl.2d
 777   %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
 778   %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
 779
 780   %rhs.low = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
 781
 782   %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
 783   %rhs.ext = sext <2 x i32> %rhs.low to <2 x i64>
 784
 785   %res = add <2 x i64> %lhs.ext, %rhs.ext
 786   ret <2 x i64> %res
 787 }
 788
 789 define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
     ; Sign-extending add of a two-lane splat of %lhs with the high half of %rhs
     ; (lanes 2-3). The directives expect saddl2.2d with no ext.16b before it.
 790 ; CHECK-LABEL: saddl2_duplhs:
 791 ; CHECK-NOT: ext.16b
 792 ; CHECK: saddl2.2d
 793   %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
 794   %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
 795
 796   %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 797
 798   %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
 799   %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
 800
 801   %res = add <2 x i64> %lhs.ext, %rhs.ext
 802   ret <2 x i64> %res
 803 }
 804
 805 define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
     ; Zero-extending subtract: low half of %lhs (lanes 0-1) minus a two-lane
     ; splat of %rhs. The directives expect usubl.2d with no ext.16b before it.
 806 ; CHECK-LABEL: usubl_duprhs:
 807 ; CHECK-NOT: ext.16b
 808 ; CHECK: usubl.2d
 809   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 810   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 811
 812   %lhs.low = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
 813
 814   %lhs.ext = zext <2 x i32> %lhs.low to <2 x i64>
 815   %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
 816
 817   %res = sub <2 x i64> %lhs.ext, %rhs.ext
 818   ret <2 x i64> %res
 819 }
 820
 821 define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
     ; Zero-extending subtract: high half of %lhs (lanes 2-3) minus a two-lane
     ; splat of %rhs. The directives expect usubl2.2d with no ext.16b before it.
 822 ; CHECK-LABEL: usubl2_duprhs:
 823 ; CHECK-NOT: ext.16b
 824 ; CHECK: usubl2.2d
 825   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 826   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 827
 828   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 829
 830   %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
 831   %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
 832
 833   %res = sub <2 x i64> %lhs.ext, %rhs.ext
 834   ret <2 x i64> %res
 835 }
 836
 837 define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
     ; Sign-extending subtract: a two-lane splat of %lhs minus the low half of
     ; %rhs (lanes 0-1). The directives expect ssubl.2d with no ext.16b before it.
 838 ; CHECK-LABEL: ssubl_duplhs:
 839 ; CHECK-NOT: ext.16b
 840 ; CHECK: ssubl.2d
 841   %splat.lane0 = insertelement <2 x i32> undef, i32 %lhs, i32 0
 842   %splat = insertelement <2 x i32> %splat.lane0, i32 %lhs, i32 1
 843
 844   %rhs.low = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
 845
 846   %splat.wide = sext <2 x i32> %splat to <2 x i64>
 847   %low.wide = sext <2 x i32> %rhs.low to <2 x i64>
 848
 849   %diff = sub <2 x i64> %splat.wide, %low.wide
 850   ret <2 x i64> %diff
 851 }
 852
 853 define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
     ; Sign-extending subtract: a two-lane splat of %lhs minus the high half of
     ; %rhs (lanes 2-3). The directives expect ssubl2.2d with no ext.16b before it.
 854 ; CHECK-LABEL: ssubl2_duplhs:
 855 ; CHECK-NOT: ext.16b
 856 ; CHECK: ssubl2.2d
 857   %splat.lane0 = insertelement <2 x i32> undef, i32 %lhs, i32 0
 858   %splat = insertelement <2 x i32> %splat.lane0, i32 %lhs, i32 1
 859
 860   %rhs.top = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 861
 862   %splat.wide = sext <2 x i32> %splat to <2 x i64>
 863   %top.wide = sext <2 x i32> %rhs.top to <2 x i64>
 864
 865   %diff = sub <2 x i64> %splat.wide, %top.wide
 866   ret <2 x i64> %diff
 867 }
 868
 869 define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
     ; Open-coded add / lshr by 8 / trunc on <8 x i16>, i.e. the upper byte of
     ; each 16-bit sum. The directives expect this to be selected as addhn.8b.
 870 ;CHECK-LABEL: addhn8b_natural:
 871 ;CHECK: addhn.8b
 872         %a = load <8 x i16>, <8 x i16>* %A
 873         %b = load <8 x i16>, <8 x i16>* %B
 874         %wide = add <8 x i16> %a, %b
 875         %upper = lshr <8 x i16> %wide, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 876         %half = trunc <8 x i16> %upper to <8 x i8>
 877         ret <8 x i8> %half
 878 }
 879
 880 define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
     ; Open-coded add / lshr by 16 / trunc on <4 x i32>, i.e. the upper 16 bits
     ; of each 32-bit sum. The directives expect this to be selected as addhn.4h.
 881 ;CHECK-LABEL: addhn4h_natural:
 882 ;CHECK: addhn.4h
 883         %a = load <4 x i32>, <4 x i32>* %A
 884         %b = load <4 x i32>, <4 x i32>* %B
 885         %wide = add <4 x i32> %a, %b
 886         %upper = lshr <4 x i32> %wide, <i32 16, i32 16, i32 16, i32 16>
 887         %half = trunc <4 x i32> %upper to <4 x i16>
 888         ret <4 x i16> %half
 889 }
 890
 891 define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
     ; Open-coded add / lshr by 32 / trunc on <2 x i64>, i.e. the upper 32 bits
     ; of each 64-bit sum. The directives expect this to be selected as addhn.2s.
 892 ;CHECK-LABEL: addhn2s_natural:
 893 ;CHECK: addhn.2s
 894         %a = load <2 x i64>, <2 x i64>* %A
 895         %b = load <2 x i64>, <2 x i64>* %B
 896         %wide = add <2 x i64> %a, %b
 897         %upper = lshr <2 x i64> %wide, <i64 32, i64 32>
 898         %half = trunc <2 x i64> %upper to <2 x i32>
 899         ret <2 x i32> %half
 900 }
 901
 902 define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
     ; Open-coded add / lshr by 8 / trunc on <8 x i16>, with the narrowed result
     ; concatenated after %low. The directives expect addhn2.16b.
 903 ;CHECK-LABEL: addhn2_16b_natural:
 904 ;CHECK: addhn2.16b
 905         %a = load <8 x i16>, <8 x i16>* %A
 906         %b = load <8 x i16>, <8 x i16>* %B
 907         %wide = add <8 x i16> %a, %b
 908         %upper = lshr <8 x i16> %wide, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 909         %half = trunc <8 x i16> %upper to <8 x i8>
 910         %joined = shufflevector <8 x i8> %low, <8 x i8> %half, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 911         ret <16 x i8> %joined
 912 }
 913
 914 define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
     ; Open-coded add / lshr by 16 / trunc on <4 x i32>, with the narrowed result
     ; concatenated after %low. The directives expect addhn2.8h.
 915 ;CHECK-LABEL: addhn2_8h_natural:
 916 ;CHECK: addhn2.8h
 917         %a = load <4 x i32>, <4 x i32>* %A
 918         %b = load <4 x i32>, <4 x i32>* %B
 919         %wide = add <4 x i32> %a, %b
 920         %upper = lshr <4 x i32> %wide, <i32 16, i32 16, i32 16, i32 16>
 921         %half = trunc <4 x i32> %upper to <4 x i16>
 922         %joined = shufflevector <4 x i16> %low, <4 x i16> %half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 923         ret <8 x i16> %joined
 924 }
 925
 926 define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
     ; Open-coded add / lshr by 32 / trunc on <2 x i64>, with the narrowed result
     ; concatenated after %low. The directives expect addhn2.4s.
 927 ;CHECK-LABEL: addhn2_4s_natural:
 928 ;CHECK: addhn2.4s
 929         %a = load <2 x i64>, <2 x i64>* %A
 930         %b = load <2 x i64>, <2 x i64>* %B
 931         %wide = add <2 x i64> %a, %b
 932         %upper = lshr <2 x i64> %wide, <i64 32, i64 32>
 933         %half = trunc <2 x i64> %upper to <2 x i32>
 934         %joined = shufflevector <2 x i32> %low, <2 x i32> %half, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 935         ret <4 x i32> %joined
 936 }
 937
 938 define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
     ; Open-coded sub / lshr by 8 / trunc on <8 x i16>, i.e. the upper byte of
     ; each 16-bit difference. The directives expect this to be selected as subhn.8b.
 939 ;CHECK-LABEL: subhn8b_natural:
 940 ;CHECK: subhn.8b
 941         %a = load <8 x i16>, <8 x i16>* %A
 942         %b = load <8 x i16>, <8 x i16>* %B
 943         %wide = sub <8 x i16> %a, %b
 944         %upper = lshr <8 x i16> %wide, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 945         %half = trunc <8 x i16> %upper to <8 x i8>
 946         ret <8 x i8> %half
 947 }
 948
 949 define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
     ; Open-coded sub / lshr by 16 / trunc on <4 x i32>, i.e. the upper 16 bits of
     ; each 32-bit difference. The directives expect this to be selected as subhn.4h.
 950 ;CHECK-LABEL: subhn4h_natural:
 951 ;CHECK: subhn.4h
 952         %a = load <4 x i32>, <4 x i32>* %A
 953         %b = load <4 x i32>, <4 x i32>* %B
 954         %wide = sub <4 x i32> %a, %b
 955         %upper = lshr <4 x i32> %wide, <i32 16, i32 16, i32 16, i32 16>
 956         %half = trunc <4 x i32> %upper to <4 x i16>
 957         ret <4 x i16> %half
 958 }
 959
 960 define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
     ; Open-coded sub / lshr by 32 / trunc on <2 x i64>, i.e. the upper 32 bits of
     ; each 64-bit difference. The directives expect this to be selected as subhn.2s.
 961 ;CHECK-LABEL: subhn2s_natural:
 962 ;CHECK: subhn.2s
 963         %a = load <2 x i64>, <2 x i64>* %A
 964         %b = load <2 x i64>, <2 x i64>* %B
 965         %wide = sub <2 x i64> %a, %b
 966         %upper = lshr <2 x i64> %wide, <i64 32, i64 32>
 967         %half = trunc <2 x i64> %upper to <2 x i32>
 968         ret <2 x i32> %half
 969 }
 970
 971 define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
     ; Open-coded sub / lshr by 8 / trunc on <8 x i16>, with the narrowed result
     ; concatenated after %low. The directives expect subhn2.16b.
 972 ;CHECK-LABEL: subhn2_16b_natural:
 973 ;CHECK: subhn2.16b
 974         %a = load <8 x i16>, <8 x i16>* %A
 975         %b = load <8 x i16>, <8 x i16>* %B
 976         %wide = sub <8 x i16> %a, %b
 977         %upper = lshr <8 x i16> %wide, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 978         %half = trunc <8 x i16> %upper to <8 x i8>
 979         %joined = shufflevector <8 x i8> %low, <8 x i8> %half, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 980         ret <16 x i8> %joined
 981 }
 982
 983 define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
     ; Open-coded sub / lshr by 16 / trunc on <4 x i32>, with the narrowed result
     ; concatenated after %low. The directives expect subhn2.8h.
 984 ;CHECK-LABEL: subhn2_8h_natural:
 985 ;CHECK: subhn2.8h
 986         %a = load <4 x i32>, <4 x i32>* %A
 987         %b = load <4 x i32>, <4 x i32>* %B
 988         %wide = sub <4 x i32> %a, %b
 989         %upper = lshr <4 x i32> %wide, <i32 16, i32 16, i32 16, i32 16>
 990         %half = trunc <4 x i32> %upper to <4 x i16>
 991         %joined = shufflevector <4 x i16> %low, <4 x i16> %half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 992         ret <8 x i16> %joined
 993 }
 994
 995 define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
     ; Open-coded sub / lshr by 32 / trunc on <2 x i64>, with the narrowed result
     ; concatenated after %low. The directives expect subhn2.4s.
 996 ;CHECK-LABEL: subhn2_4s_natural:
 997 ;CHECK: subhn2.4s
 998         %a = load <2 x i64>, <2 x i64>* %A
 999         %b = load <2 x i64>, <2 x i64>* %B
1000         %wide = sub <2 x i64> %a, %b
1001         %upper = lshr <2 x i64> %wide, <i64 32, i64 32>
1002         %half = trunc <2 x i64> %upper to <2 x i32>
1003         %joined = shufflevector <2 x i32> %low, <2 x i32> %half, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1004         ret <4 x i32> %joined
1005 }