test/CodeGen/AArch64/arm64-neon-v8.1a.ll

   1 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a
   2 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
   3 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=falkor -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
   4 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
   5 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=saphira -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
   6 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
   7
   8 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
   9 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
  10 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
  11 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
  12 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
  13 declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
  14
  15 declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
  16 declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
  17 declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
  18 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
  19 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
  20 declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
  21
  22 declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
  23 declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
  24 declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
  25 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
  26 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
  27 declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
  28
  29 ;-----------------------------------------------------------------------------
  30 ; RDMA Vector
  31 ; test for SIMDThreeSameVectorSQRDMLxHTiedHS
  32
  33 define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
  34 ; CHECK-LABEL: test_sqrdmlah_v4i16:
     ; <4 x i16> sqrdmulh(%mhs, %rhs) feeding sqadd with %acc. The baseline run expects a
     ; standalone sqrdmulh; the v8.1a-prefixed runs expect one sqrdmlah accumulating into v0.
  35    %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
  36    %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc,  <4 x i16> %prod)
  37 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
  38 ; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.4h
  39 ; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2
  40    ret <4 x i16> %retval
  41 }
  42
  43 define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
  44 ; CHECK-LABEL: test_sqrdmlah_v8i16:
     ; <8 x i16> sqrdmulh(%mhs, %rhs) feeding sqadd with %acc. The baseline run expects a
     ; standalone sqrdmulh; the v8.1a-prefixed runs expect one sqrdmlah accumulating into v0.
  45    %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  46    %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
  47 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
  48 ; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.8h
  49 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
  50    ret <8 x i16> %retval
  51 }
  52
  53 define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
  54 ; CHECK-LABEL: test_sqrdmlah_v2i32:
     ; <2 x i32> sqrdmulh(%mhs, %rhs) feeding sqadd with %acc. The baseline run expects a
     ; standalone sqrdmulh; the v8.1a-prefixed runs expect one sqrdmlah accumulating into v0.
  55    %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  56    %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
  57 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
  58 ; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.2s
  59 ; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2
  60    ret <2 x i32> %retval
  61 }
  62
  63 define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
  64 ; CHECK-LABEL: test_sqrdmlah_v4i32:
     ; <4 x i32> sqrdmulh(%mhs, %rhs) feeding sqadd with %acc. The baseline run expects a
     ; standalone sqrdmulh; the v8.1a-prefixed runs expect one sqrdmlah accumulating into v0.
  65    %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  66    %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  67 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
  68 ; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.4s
  69 ; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2
  70    ret <4 x i32> %retval
  71 }
  72
  73 define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
  74 ; CHECK-LABEL: test_sqrdmlsh_v4i16:
     ; <4 x i16> sqrdmulh(%mhs, %rhs) subtracted from %acc via sqsub. The baseline run expects a
     ; standalone sqrdmulh; the v8.1a-prefixed runs expect one sqrdmlsh accumulating into v0.
  75    %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
  76    %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
  77 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
  78 ; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.4h
  79 ; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2
  80    ret <4 x i16> %retval
  81 }
  82
  83 define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
  84 ; CHECK-LABEL: test_sqrdmlsh_v8i16:
     ; <8 x i16> sqrdmulh(%mhs, %rhs) subtracted from %acc via sqsub. The baseline run expects a
     ; standalone sqrdmulh; the v8.1a-prefixed runs expect one sqrdmlsh accumulating into v0.
  85    %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  86    %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
  87 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
  88 ; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.8h
  89 ; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2
  90    ret <8 x i16> %retval
  91 }
  92
  93 define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
  94 ; CHECK-LABEL: test_sqrdmlsh_v2i32:
     ; <2 x i32> sqrdmulh(%mhs, %rhs) subtracted from %acc via sqsub. The baseline run expects a
     ; standalone sqrdmulh; the v8.1a-prefixed runs expect one sqrdmlsh accumulating into v0.
  95    %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  96    %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
  97 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
  98 ; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.2s
  99 ; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2
 100    ret <2 x i32> %retval
 101 }
 102
 103 define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
 104 ; CHECK-LABEL: test_sqrdmlsh_v4i32:
     ; <4 x i32> sqrdmulh(%mhs, %rhs) subtracted from %acc via sqsub. The baseline run expects a
     ; standalone sqrdmulh; the v8.1a-prefixed runs expect one sqrdmlsh accumulating into v0.
 105    %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
 106    %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
 107 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
 108 ; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.4s
 109 ; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2
 110    ret <4 x i32> %retval
 111 }
 112
 113 ;-----------------------------------------------------------------------------
 114 ; RDMA Vector, by element
 115 ; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
 116
 117 define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
 118 ; CHECK-LABEL: test_sqrdmlah_lane_s16:
     ; %shuffle splats lane 3 of %v; the splat is multiplied with %x by sqrdmulh and the product
     ; is added to %acc with sqadd. Every run expects the by-element operand form (lane 3 of v2):
     ; a standalone sqrdmulh in the baseline run, a fused sqrdmlah in the v8.1a-prefixed runs.
 119 entry:
 120   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 121   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
 122   %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
 123 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
 124 ; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.h[3]
 125 ; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2[3]
 126   ret <4 x i16> %retval
 127 }
 128
 129 define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
 130 ; CHECK-LABEL: test_sqrdmlahq_lane_s16:
     ; %shuffle splats lane 2 of %v; the splat is multiplied with %x by sqrdmulh and the product
     ; is added to %acc with sqadd. Every run expects the by-element operand form (lane 2 of v2).
 131 entry:
 132   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
 133   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
 134   %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
 135 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
 136 ; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.h[2]
 137 ; CHECK-V81a-apple: sqrdmlah.8h v0,    v1,    v2[2]
 138   ret <8 x i16> %retval
 139 }
 140
 141 define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
 142 ; CHECK-LABEL: test_sqrdmlah_lane_s32:
     ; %shuffle splats lane 1 of %v; the splat is multiplied with %x by sqrdmulh and the product
     ; is added to %acc with sqadd. Every run expects the by-element operand form (lane 1 of v2).
 143 entry:
 144   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 145   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
 146   %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
 147 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
 148 ; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.s[1]
 149 ; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2[1]
 150   ret <2 x i32> %retval
 151 }
 152
 153 define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
 154 ; CHECK-LABEL: test_sqrdmlahq_lane_s32:
     ; %shuffle splats lane 0 of %v (zeroinitializer mask); the splat is multiplied with %x by
     ; sqrdmulh and added to %acc with sqadd. Every run expects the by-element form (lane 0 of v2).
 155 entry:
 156   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
 157   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
 158   %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
 159 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
 160 ; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.s[0]
 161 ; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2[0]
 162   ret <4 x i32> %retval
 163 }
 164
 165 define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
 166 ; CHECK-LABEL: test_sqrdmlsh_lane_s16:
     ; %shuffle splats lane 3 of %v; the splat is multiplied with %x by sqrdmulh and the product
     ; is subtracted from %acc with sqsub. Every run expects the by-element form (lane 3 of v2).
 167 entry:
 168   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 169   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
 170   %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
 171 ; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
 172 ; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.h[3]
 173 ; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2[3]
 174   ret <4 x i16> %retval
 175 }
 176
 177 define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
 178 ; CHECK-LABEL: test_sqrdmlshq_lane_s16:
     ; %shuffle splats lane 2 of %v; the splat is multiplied with %x by sqrdmulh and the product
     ; is subtracted from %acc with sqsub. Every run expects the by-element form (lane 2 of v2).
 179 entry:
 180   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
 181   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
 182   %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
 183 ; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
 184 ; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.h[2]
 185 ; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2[2]
 186   ret <8 x i16> %retval
 187 }
 188
 189 define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
 190 ; CHECK-LABEL: test_sqrdmlsh_lane_s32:
     ; %shuffle splats lane 1 of %v; the splat is multiplied with %x by sqrdmulh and the product
     ; is subtracted from %acc with sqsub. Every run expects the by-element form (lane 1 of v2).
 191 entry:
 192   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 193   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
 194   %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
 195 ; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
 196 ; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.s[1]
 197 ; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2[1]
 198   ret <2 x i32> %retval
 199 }
 200
 201 define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
 202 ; CHECK-LABEL: test_sqrdmlshq_lane_s32:
     ; %shuffle splats lane 0 of %v (zeroinitializer mask); the splat is multiplied with %x by
     ; sqrdmulh and subtracted from %acc with sqsub. Every run expects the by-element form.
 203 entry:
 204   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
 205   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
 206   %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
 207 ; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
 208 ; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.s[0]
 209 ; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2[0]
 210   ret <4 x i32> %retval
 211 }
 212
 213 ;-----------------------------------------------------------------------------
 214 ; RDMA Vector, by element, extracted
 215 ; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
 216 ; i32 tests are for   "def : Pat" in SIMDIndexedSQRDMLxHSDTied
 217
 218 define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
 219 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
     ; The scalar %acc is inserted into lane 0 of an undef <4 x i16>, accumulated with the
     ; lane-1 by-element product via sqadd, and only lane 0 of the result is returned.
     ; The v8.1a patterns accept a destination register whose digits are 2-9 (so not v0/v1).
 220 entry:
 221   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
 222   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
 223   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
 224   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
 225   %retval = extractelement <4 x i16> %retval_vec, i64 0
 226 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
 227 ; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
 228 ; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}},    v0,    v1[1]
 229   ret i16 %retval
 230 }
 231
 232 define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
 233 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
     ; The scalar %acc is inserted into lane 0 of an undef <8 x i16>, accumulated with the
     ; lane-1 by-element product via sqadd, and only lane 0 of the result is returned.
     ; The v8.1a patterns accept a destination register whose digits are 2-9 (so not v0/v1).
 234 entry:
 235   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
 236   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
 237   %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
 238   %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
 239   %retval = extractelement <8 x i16> %retval_vec, i64 0
 240 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
 241 ; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
 242 ; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}},    v0,    v1[1]
 243   ret i16 %retval
 244 }
 245
 246 define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
 247 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
     ; Lane 0 of the <2 x i32> by-element sqrdmulh result is extracted and accumulated with the
     ; scalar sqadd.i32 intrinsic. The v8.1a-prefixed runs still expect the vector by-element
     ; sqrdmlah, with v2 as its destination.
 248 entry:
 249   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
 250   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
 251   %extract = extractelement <2 x i32> %prod, i64 0
 252   %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
 253 ; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
 254 ; CHECK-V81a:       sqrdmlah    v2.2s, v0.2s, v1.s[0]
 255 ; CHECK-V81a-apple: sqrdmlah.2s v2,    v0,    v1[0]
 256   ret i32 %retval
 257 }
 258
 259 define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
 260 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
     ; Lane 0 of the <4 x i32> by-element sqrdmulh result is extracted and accumulated with the
     ; scalar sqadd.i32 intrinsic. The v8.1a-prefixed runs still expect the vector by-element
     ; sqrdmlah, with v2 as its destination.
 261 entry:
 262   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
 263   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
 264   %extract = extractelement <4 x i32> %prod, i64 0
 265   %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
 266 ; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
 267 ; CHECK-V81a:       sqrdmlah    v2.4s, v0.4s, v1.s[0]
 268 ; CHECK-V81a-apple: sqrdmlah.4s v2,    v0,    v1[0]
 269   ret i32 %retval
 270 }
 271
 272 define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
 273 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
     ; The scalar %acc is inserted into lane 0 of an undef <4 x i16>, the lane-1 by-element
     ; product is subtracted from it via sqsub, and only lane 0 of the result is returned.
     ; The v8.1a patterns accept a destination register whose digits are 2-9 (so not v0/v1).
 274 entry:
 275   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
 276   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
 277   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
 278   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
 279   %retval = extractelement <4 x i16> %retval_vec, i64 0
 280 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
 281 ; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
 282 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}},    v0,    v1[1]
 283   ret i16 %retval
 284 }
 285
 286 define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
 287 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
     ; The scalar %acc is inserted into lane 0 of an undef <8 x i16>, the lane-1 by-element
     ; product is subtracted from it via sqsub, and only lane 0 of the result is returned.
     ; The v8.1a patterns accept a destination register whose digits are 2-9 (so not v0/v1).
 288 entry:
 289   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
 290   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
 291   %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
 292   %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
 293   %retval = extractelement <8 x i16> %retval_vec, i64 0
 294 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
 295 ; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
 296 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}},    v0,    v1[1]
 297   ret i16 %retval
 298 }
 299
 300 define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
 301 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
     ; Lane 0 of the <2 x i32> by-element sqrdmulh result is extracted and subtracted from %acc
     ; with the scalar sqsub.i32 intrinsic. The v8.1a-prefixed runs still expect the vector
     ; by-element sqrdmlsh, with v2 as its destination.
 302 entry:
 303   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
 304   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
 305   %extract = extractelement <2 x i32> %prod, i64 0
 306   %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
 307 ; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
 308 ; CHECK-V81a:       sqrdmlsh    v2.2s, v0.2s, v1.s[0]
 309 ; CHECK-V81a-apple: sqrdmlsh.2s v2,    v0,    v1[0]
 310   ret i32 %retval
 311 }
 312
 313 define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
 314 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
     ; Lane 0 of the <4 x i32> by-element sqrdmulh result is extracted and subtracted from %acc
     ; with the scalar sqsub.i32 intrinsic. The v8.1a-prefixed runs still expect the vector
     ; by-element sqrdmlsh, with v2 as its destination.
 315 entry:
 316   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
 317   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
 318   %extract = extractelement <4 x i32> %prod, i64 0
 319   %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
 320 ; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
 321 ; CHECK-V81a:       sqrdmlsh    v2.4s, v0.4s, v1.s[0]
 322 ; CHECK-V81a-apple: sqrdmlsh.4s v2,    v0,    v1[0]
 323   ret i32 %retval
 324 }
 325
 326 ;-----------------------------------------------------------------------------
 327 ; RDMA Scalar
 328 ; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
 329
 330 define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
 331 ; CHECK-LABEL: test_sqrdmlah_v1i16:
     ; All three scalar i16 operands are inserted into lane 0 of undef <4 x i16> vectors, combined
     ; with the vector sqrdmulh/sqadd intrinsics, and lane 0 of the result is returned.
     ; The checks accept any v registers and only pin the mnemonic and the .4h arrangement.
 332   %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
 333   %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
 334   %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec,  <4 x i16> %y_vec)
 335   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
 336   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec,  <4 x i16> %prod_vec)
 337   %retval = extractelement <4 x i16> %retval_vec, i64 0
 338 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 339 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 340 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
 341   ret i16 %retval
 342 }
 343
 344 define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
 345 ; CHECK-LABEL: test_sqrdmlah_v1i32:
     ; All three scalar i32 operands are inserted into lane 0 of undef <4 x i32> vectors, combined
     ; with the vector sqrdmulh/sqadd intrinsics, and lane 0 of the result is returned.
     ; The checks accept any v registers and only pin the mnemonic and the .4s arrangement.
 346   %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
 347   %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
 348   %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec,  <4 x i32> %y_vec)
 349   %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
 350   %retval_vec =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec,  <4 x i32> %prod_vec)
 351   %retval = extractelement <4 x i32> %retval_vec, i64 0
 352 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 353 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 354 ; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
 355   ret i32 %retval
 356 }
 357
 358
 359 define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
 360 ; CHECK-LABEL: test_sqrdmlsh_v1i16:
     ; All three scalar i16 operands are inserted into lane 0 of undef <4 x i16> vectors, combined
     ; with the vector sqrdmulh/sqsub intrinsics, and lane 0 of the result is returned.
     ; The checks accept any v registers and only pin the mnemonic and the .4h arrangement.
 361   %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
 362   %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
 363   %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec,  <4 x i16> %y_vec)
 364   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
 365   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec,  <4 x i16> %prod_vec)
 366   %retval = extractelement <4 x i16> %retval_vec, i64 0
 367 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 368 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 369 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
 370   ret i16 %retval
 371 }
 372
 373 define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
 374 ; CHECK-LABEL: test_sqrdmlsh_v1i32:
     ; All three scalar i32 operands are inserted into lane 0 of undef <4 x i32> vectors, combined
     ; with the vector sqrdmulh/sqsub intrinsics, and lane 0 of the result is returned.
     ; The checks accept any v registers and only pin the mnemonic and the .4s arrangement.
 375   %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
 376   %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
 377   %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec,  <4 x i32> %y_vec)
 378   %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
 379   %retval_vec =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec,  <4 x i32> %prod_vec)
 380   %retval = extractelement <4 x i32> %retval_vec, i64 0
 381 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 382 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 383 ; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
 384   ret i32 %retval
 385 }
 386 define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
 387 ; CHECK-LABEL: test_sqrdmlah_i32:
     ; Purely scalar form: sqrdmulh.i32 feeding sqadd.i32. All runs expect s-register operands;
     ; the apple-syntax check uses the same unsuffixed mnemonic as the generic one.
 388   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
 389   %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
 390 ; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 391 ; CHECK-V81a:       sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 392 ; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 393   ret i32 %retval
 394 }
 395
 396 define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
 397 ; CHECK-LABEL: test_sqrdmlsh_i32:
     ; Purely scalar form: sqrdmulh.i32 feeding sqsub.i32. All runs expect s-register operands;
     ; the apple-syntax check uses the same unsuffixed mnemonic as the generic one.
 398   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
 399   %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
 400 ; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 401 ; CHECK-V81a:       sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 402 ; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 403   ret i32 %retval
 404 }
 405
 406 ;-----------------------------------------------------------------------------
 407 ; RDMA Scalar, by element
 408 ; i16 tests are performed via tests in above chapter, with IR in ACLE style
 409 ; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
 410
 411 define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
 412 ; CHECK-LABEL: test_sqrdmlah_extract_i16:
     ; %x and %acc are scalars placed in lane 0 of undef <4 x i16> vectors; the multiplier is a
     ; splat of lane 1 of %y_vec. The checks expect the by-element operand to be lane 1 of v0
     ; and accept any register for the other two operands.
 413   %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
 414   %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
 415   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
 416   %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
 417   %retval_vec =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
 418   %retval = extractelement <4 x i16> %retval_vec, i32 0
 419 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
 420 ; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
 421 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
 422   ret i16 %retval
 423 }
 424
 425 define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
 426 ; CHECK-LABEL: test_sqrdmlah_extract_i32:
     ; Lane 3 of %rhs is extracted and fed to the scalar sqrdmulh.i32, whose result is added to
     ; %acc with sqadd.i32. The checks expect the scalar by-element form: s-register
     ; destination and first source, with lane 3 of v0 as the indexed operand.
 427   %extract = extractelement <4 x i32> %rhs, i32 3
 428   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
 429   %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
 430 ; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 431 ; CHECK-V81a:       sqrdmlah   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 432 ; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
 433   ret i32 %retval
 434 }
 435
 436 define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
 437 ; CHECK-LABEL: test_sqrdmlshq_extract_i16:
     ; %x and %acc are scalars placed in lane 0 of undef <8 x i16> vectors; the multiplier is a
     ; splat of lane 1 of %y_vec and the product is subtracted via sqsub. The checks expect the
     ; by-element operand to be lane 1 of v0 and accept any register for the other two operands.
 438   %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
 439   %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
 440   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
 441   %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
 442   %retval_vec =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
 443   %retval = extractelement <8 x i16> %retval_vec, i32 0
 444 ; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
 445 ; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
 446 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
 447   ret i16 %retval
 448 }
 449
 450 define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
 451 ; CHECK-LABEL: test_sqrdmlsh_extract_i32:
     ; Lane 3 of %rhs is extracted and fed to the scalar sqrdmulh.i32, whose result is subtracted
     ; from %acc with sqsub.i32. The checks expect the scalar by-element form: s-register
     ; destination and first source, with lane 3 of v0 as the indexed operand.
 452   %extract = extractelement <4 x i32> %rhs, i32 3
 453   %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
 454   %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
 455 ; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 456 ; CHECK-V81a:       sqrdmlsh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
 457 ; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
 458   ret i32 %retval
 459 }