1 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a
2 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
3 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=falkor -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
4 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
5 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=saphira -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
6 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
8 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
9 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
10 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
11 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
12 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
13 declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
15 declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
16 declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
17 declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
18 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
19 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
20 declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
22 declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
23 declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
24 declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
25 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
26 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
27 declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
29 ;-----------------------------------------------------------------------------
31 ; test for SIMDThreeSameVectorSQRDMLxHTiedHS
; <4 x i16> sqrdmulh of %mhs and %rhs, accumulated into %acc with sqadd.
; The V8a check expects the standalone sqrdmulh; the V81a checks (generic and
; apple syntax) expect a single sqrdmlah whose destination is v0.
33 define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
34 ; CHECK-LABEL: test_sqrdmlah_v4i16:
35 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
36 %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
37 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
38 ; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h
39 ; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2
; <8 x i16> sqrdmulh of %mhs and %rhs, accumulated into %acc with sqadd.
; The V8a check expects the standalone sqrdmulh; the V81a checks (generic and
; apple syntax) expect a single sqrdmlah whose destination is v0.
43 define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
44 ; CHECK-LABEL: test_sqrdmlah_v8i16:
45 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
46 %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
47 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
48 ; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
49 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
; <2 x i32> sqrdmulh of %mhs and %rhs, accumulated into %acc with sqadd.
; The V8a check expects the standalone sqrdmulh; the V81a checks (generic and
; apple syntax) expect a single sqrdmlah whose destination is v0.
53 define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
54 ; CHECK-LABEL: test_sqrdmlah_v2i32:
55 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
56 %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
57 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
58 ; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
59 ; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
; <4 x i32> sqrdmulh of %mhs and %rhs, accumulated into %acc with sqadd.
; The V8a check expects the standalone sqrdmulh; the V81a checks (generic and
; apple syntax) expect a single sqrdmlah whose destination is v0.
; Every directive below must use a prefix that some RUN line enables via
; --check-prefix, otherwise FileCheck silently ignores it.
63 define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
64 ; CHECK-LABEL: test_sqrdmlah_v4i32:
65 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
66 %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
67 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
68 ; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
69 ; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
; <4 x i16> sqrdmulh of %mhs and %rhs, subtracted from %acc with sqsub.
; The V8a check expects the standalone sqrdmulh; the V81a checks (generic and
; apple syntax) expect a single sqrdmlsh whose destination is v0.
73 define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
74 ; CHECK-LABEL: test_sqrdmlsh_v4i16:
75 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
76 %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
77 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
78 ; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
79 ; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
; <8 x i16> sqrdmulh of %mhs and %rhs, subtracted from %acc with sqsub.
; The V8a check expects the standalone sqrdmulh; the V81a checks (generic and
; apple syntax) expect a single sqrdmlsh whose destination is v0.
83 define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
84 ; CHECK-LABEL: test_sqrdmlsh_v8i16:
85 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
86 %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
87 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
88 ; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
89 ; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
; <2 x i32> sqrdmulh of %mhs and %rhs, subtracted from %acc with sqsub.
; The V8a check expects the standalone sqrdmulh; the V81a checks (generic and
; apple syntax) expect a single sqrdmlsh whose destination is v0.
93 define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
94 ; CHECK-LABEL: test_sqrdmlsh_v2i32:
95 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
96 %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
97 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
98 ; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
99 ; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
100 ret <2 x i32> %retval
; <4 x i32> sqrdmulh of %mhs and %rhs, subtracted from %acc with sqsub.
; The V8a check expects the standalone sqrdmulh; the V81a checks (generic and
; apple syntax) expect a single sqrdmlsh whose destination is v0.
103 define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
104 ; CHECK-LABEL: test_sqrdmlsh_v4i32:
105 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
106 %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
107 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
108 ; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
109 ; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
110 ret <4 x i32> %retval
113 ;-----------------------------------------------------------------------------
114 ; RDMA Vector, by element
115 ; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
; By-element form: lane 3 of %v is splatted with shufflevector, multiplied with
; %x via sqrdmulh.v4i16, and the product is accumulated into %acc with sqadd.
; The V8a check expects the indexed sqrdmulh; the V81a checks (generic and
; apple syntax) expect an indexed sqrdmlah writing v0.
; FileCheck only recognises a directive when the colon immediately follows the
; prefix, so no whitespace may appear between them.
117 define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
118 ; CHECK-LABEL: test_sqrdmlah_lane_s16:
120 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
121 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
122 %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
123 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
124 ; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
125 ; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
126 ret <4 x i16> %retval
; By-element form: lane 2 of %v is splatted with shufflevector, multiplied with
; %x via sqrdmulh.v8i16, and the product is accumulated into %acc with sqadd.
; The V8a check expects the indexed sqrdmulh; the V81a checks expect an
; indexed sqrdmlah writing v0.
129 define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
130 ; CHECK-LABEL: test_sqrdmlahq_lane_s16:
132 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
133 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
134 %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
135 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
136 ; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
137 ; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
138 ret <8 x i16> %retval
; By-element form: lane 1 of %v is splatted with shufflevector, multiplied with
; %x via sqrdmulh.v2i32, and the product is accumulated into %acc with sqadd.
; The V8a check expects the indexed sqrdmulh; the V81a checks expect an
; indexed sqrdmlah writing v0.
141 define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
142 ; CHECK-LABEL: test_sqrdmlah_lane_s32:
144 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
145 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
146 %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
147 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
148 ; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
149 ; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
150 ret <2 x i32> %retval
; By-element form: lane 0 of %v is splatted (zeroinitializer shuffle mask),
; multiplied with %x via sqrdmulh.v4i32, and accumulated into %acc with sqadd.
; The V8a check expects the indexed sqrdmulh; the V81a checks expect an
; indexed sqrdmlah writing v0.
153 define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
154 ; CHECK-LABEL: test_sqrdmlahq_lane_s32:
156 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
157 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
158 %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
159 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
160 ; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
161 ; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
162 ret <4 x i32> %retval
; By-element form: lane 3 of %v is splatted with shufflevector, multiplied with
; %x via sqrdmulh.v4i16, and the product is subtracted from %acc with sqsub.
; The V8a check expects the indexed sqrdmulh; the V81a checks expect an
; indexed sqrdmlsh writing v0.
165 define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
166 ; CHECK-LABEL: test_sqrdmlsh_lane_s16:
168 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
169 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
170 %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
171 ; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
172 ; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
173 ; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
174 ret <4 x i16> %retval
; By-element form: lane 2 of %v is splatted with shufflevector, multiplied with
; %x via sqrdmulh.v8i16, and the product is subtracted from %acc with sqsub.
; The V8a check expects the indexed sqrdmulh; the V81a checks expect an
; indexed sqrdmlsh writing v0.
177 define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
178 ; CHECK-LABEL: test_sqrdmlshq_lane_s16:
180 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
181 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
182 %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
183 ; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
184 ; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
185 ; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
186 ret <8 x i16> %retval
; By-element form: lane 1 of %v is splatted with shufflevector, multiplied with
; %x via sqrdmulh.v2i32, and the product is subtracted from %acc with sqsub.
; The V8a check expects the indexed sqrdmulh; the V81a checks expect an
; indexed sqrdmlsh writing v0.
189 define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
190 ; CHECK-LABEL: test_sqrdmlsh_lane_s32:
192 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
193 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
194 %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
195 ; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
196 ; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1]
197 ; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1]
198 ret <2 x i32> %retval
; By-element form: lane 0 of %v is splatted (zeroinitializer shuffle mask),
; multiplied with %x via sqrdmulh.v4i32, and subtracted from %acc with sqsub.
; The V8a check expects the indexed sqrdmulh; the V81a checks expect an
; indexed sqrdmlsh writing v0.
201 define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
202 ; CHECK-LABEL: test_sqrdmlshq_lane_s32:
204 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
205 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
206 %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
207 ; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
208 ; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0]
209 ; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0]
210 ret <4 x i32> %retval
213 ;-----------------------------------------------------------------------------
214 ; RDMA Vector, by element, extracted
215 ; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
216 ; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
; Scalar i16 accumulate expressed through vectors: lane 1 of %v is splatted and
; multiplied with %x, %acc is inserted into lane 0 of an undef vector, the two
; are combined with sqadd.v4i16, and lane 0 of the result is extracted.
; The checks expect the indexed instruction with v1.h[1] as the lane operand;
; the V81a patterns only accept a destination matching v[2-9]+.
218 define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
219 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
221 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
222 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
223 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
224 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
225 %retval = extractelement <4 x i16> %retval_vec, i64 0
226 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
227 ; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
228 ; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1]
; Scalar i16 accumulate expressed through 128-bit vectors: lane 1 of %v is
; splatted and multiplied with %x, %acc is inserted into lane 0 of an undef
; vector, the two are combined with sqadd.v8i16, and lane 0 is extracted.
; The checks expect the indexed instruction with v1.h[1] as the lane operand;
; the V81a patterns only accept a destination matching v[2-9]+.
232 define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
233 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
235 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
236 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
237 %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
238 %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
239 %retval = extractelement <8 x i16> %retval_vec, i64 0
240 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
241 ; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
242 ; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1]
; Lane 0 of %v is splatted and multiplied with %x via sqrdmulh.v2i32; lane 0 of
; the product is extracted and accumulated into scalar %acc with sqadd.i32.
; The V8a check expects the indexed vector sqrdmulh; the V81a checks expect
; the indexed vector sqrdmlah with v2 as destination.
246 define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
247 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
249 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
250 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
251 %extract = extractelement <2 x i32> %prod, i64 0
252 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
253 ; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
254 ; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0]
255 ; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0]
; Lane 0 of %v is splatted and multiplied with %x via sqrdmulh.v4i32; lane 0 of
; the product is extracted and accumulated into scalar %acc with sqadd.i32.
; The V8a check expects the indexed vector sqrdmulh; the V81a checks expect
; the indexed vector sqrdmlah with v2 as destination.
259 define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
260 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
262 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
263 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
264 %extract = extractelement <4 x i32> %prod, i64 0
265 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
266 ; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
267 ; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0]
268 ; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0]
; Scalar i16 subtract-accumulate expressed through vectors: lane 1 of %v is
; splatted and multiplied with %x, %acc is inserted into lane 0 of an undef
; vector, the two are combined with sqsub.v4i16, and lane 0 is extracted.
; The checks expect the indexed instruction with v1.h[1] as the lane operand;
; the V81a patterns only accept a destination matching v[2-9]+.
272 define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
273 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
275 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
276 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
277 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
278 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
279 %retval = extractelement <4 x i16> %retval_vec, i64 0
280 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
281 ; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
282 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1]
; Scalar i16 subtract-accumulate expressed through 128-bit vectors: lane 1 of
; %v is splatted and multiplied with %x, %acc is inserted into lane 0 of an
; undef vector, the two are combined with sqsub.v8i16, and lane 0 is extracted.
; The checks expect the indexed instruction with v1.h[1] as the lane operand;
; the V81a patterns only accept a destination matching v[2-9]+.
286 define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
287 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
289 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
290 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
291 %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
292 %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
293 %retval = extractelement <8 x i16> %retval_vec, i64 0
294 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
295 ; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
296 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1]
; Lane 0 of %v is splatted and multiplied with %x via sqrdmulh.v2i32; lane 0 of
; the product is extracted and subtracted from scalar %acc with sqsub.i32.
; The V8a check expects the indexed vector sqrdmulh; the V81a checks expect
; the indexed vector sqrdmlsh with v2 as destination.
300 define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
301 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
303 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
304 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
305 %extract = extractelement <2 x i32> %prod, i64 0
306 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
307 ; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
308 ; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0]
309 ; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0]
; Lane 0 of %v is splatted and multiplied with %x via sqrdmulh.v4i32; lane 0 of
; the product is extracted and subtracted from scalar %acc with sqsub.i32.
; The V8a check expects the indexed vector sqrdmulh; the V81a checks expect
; the indexed vector sqrdmlsh with v2 as destination.
313 define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
314 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
316 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
317 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
318 %extract = extractelement <4 x i32> %prod, i64 0
319 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
320 ; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
321 ; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0]
322 ; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0]
326 ;-----------------------------------------------------------------------------
328 ; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
; Scalar i16 operands routed through vectors: %x, %y and %acc are each inserted
; into lane 0 of an undef <4 x i16>, multiplied with sqrdmulh.v4i16 and
; accumulated with sqadd.v4i16; lane 0 of the result is extracted.
; The checks accept any v registers: .4h sqrdmulh for V8a, .4h sqrdmlah for V81a.
330 define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
331 ; CHECK-LABEL: test_sqrdmlah_v1i16:
332 %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
333 %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
334 %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
335 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
336 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
337 %retval = extractelement <4 x i16> %retval_vec, i64 0
338 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
339 ; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
340 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Scalar i32 operands routed through vectors: %x, %y and %acc are each inserted
; into lane 0 of an undef <4 x i32>, multiplied with sqrdmulh.v4i32 and
; accumulated with sqadd.v4i32; lane 0 of the result is extracted.
; The checks accept any v registers: .4s sqrdmulh for V8a, .4s sqrdmlah for V81a.
344 define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
345 ; CHECK-LABEL: test_sqrdmlah_v1i32:
346 %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
347 %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
348 %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
349 %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
350 %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
351 %retval = extractelement <4 x i32> %retval_vec, i64 0
352 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
353 ; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
354 ; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Scalar i16 operands routed through vectors: %x, %y and %acc are each inserted
; into lane 0 of an undef <4 x i16>, multiplied with sqrdmulh.v4i16 and
; combined with sqsub.v4i16; lane 0 of the result is extracted.
; The checks accept any v registers: .4h sqrdmulh for V8a, .4h sqrdmlsh for V81a.
359 define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
360 ; CHECK-LABEL: test_sqrdmlsh_v1i16:
361 %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
362 %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
363 %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
364 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
365 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
366 %retval = extractelement <4 x i16> %retval_vec, i64 0
367 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
368 ; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
369 ; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Scalar i32 operands routed through vectors: %x, %y and %acc are each inserted
; into lane 0 of an undef <4 x i32>, multiplied with sqrdmulh.v4i32 and
; combined with sqsub.v4i32; lane 0 of the result is extracted.
; The checks accept any v registers: .4s sqrdmulh for V8a, .4s sqrdmlsh for V81a.
373 define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
374 ; CHECK-LABEL: test_sqrdmlsh_v1i32:
375 %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
376 %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
377 %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
378 %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
379 %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
380 %retval = extractelement <4 x i32> %retval_vec, i64 0
381 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
382 ; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
383 ; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Pure scalar form: sqrdmulh.i32 of %mhs and %rhs feeding sqadd.i32 with %acc.
; The checks accept any s registers: scalar sqrdmulh for V8a, scalar sqrdmlah
; for the V81a prefixes (same spelling in generic and apple syntax).
386 define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
387 ; CHECK-LABEL: test_sqrdmlah_i32:
388 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
389 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
390 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
391 ; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
392 ; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; Pure scalar form: sqrdmulh.i32 of %mhs and %rhs feeding sqsub.i32 with %acc.
; The checks accept any s registers: scalar sqrdmulh for V8a, scalar sqrdmlsh
; for the V81a prefixes (same spelling in generic and apple syntax).
396 define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
397 ; CHECK-LABEL: test_sqrdmlsh_i32:
398 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
399 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
400 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
401 ; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
402 ; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
406 ;-----------------------------------------------------------------------------
407 ; RDMA Scalar, by element
408 ; i16 tests are covered by the tests in the section above, with IR in ACLE style
409 ; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
; Scalar i16 by-element accumulate: lane 1 of %y_vec is splatted, %x and %acc
; are each inserted into lane 0 of an undef <4 x i16>, the product comes from
; sqrdmulh.v4i16 and is accumulated with sqadd.v4i16; lane 0 is extracted.
; The checks expect the indexed instruction with v0.h[1] as the lane operand and
; accept any v registers for the other operands.
411 define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
412 ; CHECK-LABEL: test_sqrdmlah_extract_i16:
413 %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
414 %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
415 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
416 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
417 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
418 %retval = extractelement <4 x i16> %retval_vec, i32 0
419 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
420 ; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
421 ; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
; Scalar i32 by-element accumulate: lane 3 is extracted from %rhs, multiplied
; with %mhs via sqrdmulh.i32, and accumulated into %acc with sqadd.i32.
; The checks expect the scalar indexed form with lane 3 of v0 as multiplier:
; sqrdmulh for V8a, sqrdmlah for the V81a prefixes.
425 define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
426 ; CHECK-LABEL: test_sqrdmlah_extract_i32:
427 %extract = extractelement <4 x i32> %rhs, i32 3
428 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
429 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
430 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
431 ; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
432 ; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
; Scalar i16 by-element subtract-accumulate on 128-bit vectors: lane 1 of
; %y_vec is splatted, %x and %acc are each inserted into lane 0 of an undef
; <8 x i16>, the product comes from sqrdmulh.v8i16 and is combined with
; sqsub.v8i16; lane 0 is extracted.
; The checks expect the indexed instruction with v0.h[1] as the lane operand and
; accept any v registers for the other operands.
436 define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
437 ; CHECK-LABEL: test_sqrdmlshq_extract_i16:
438 %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
439 %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
440 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
441 %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
442 %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
443 %retval = extractelement <8 x i16> %retval_vec, i32 0
444 ; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
445 ; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
446 ; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
; Scalar i32 by-element subtract-accumulate: lane 3 is extracted from %rhs,
; multiplied with %mhs via sqrdmulh.i32, and subtracted from %acc with sqsub.i32.
; The checks expect the scalar indexed form with lane 3 of v0 as multiplier:
; sqrdmulh for V8a, sqrdmlsh for the V81a prefixes.
450 define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
451 ; CHECK-LABEL: test_sqrdmlsh_extract_i32:
452 %extract = extractelement <4 x i32> %rhs, i32 3
453 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
454 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
455 ; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
456 ; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
457 ; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]