1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm | FileCheck %s
3 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a | FileCheck %s
5 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
6 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
7 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
8 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
9 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
11 declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
12 declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
13 declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
14 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
15 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
17 declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
18 declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
19 declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
20 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
21 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
23 declare <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
24 declare <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
25 declare <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
26 declare <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
27 declare i32 @llvm.aarch64.neon.sqrdmlah.i32(i32, i32, i32)
29 declare <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
30 declare <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
31 declare <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
32 declare <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
33 declare i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32, i32, i32)
35 ; The sqadd intrinsics in this file were previously transformed into sqrdmlah where
36 ; they shouldn't be. The tests expect sqrdmulh followed by sqadd (and likewise sqrdmulh followed by sqsub, not sqrdmlsh).
38 ;-----------------------------------------------------------------------------
40 ; test for SIMDThreeSameVectorSQRDMLxHTiedHS
; <4 x i16> sqrdmulh whose result is the second operand of sqadd with %acc.
; The CHECK lines expect two separate instructions (sqrdmulh, then sqadd).
42 define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
43 ; CHECK-LABEL: test_sqrdmlah_v4i16:
45 ; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.4h
46 ; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
48 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
49 %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; <8 x i16> sqrdmulh whose result is the second operand of sqadd with %acc.
; The CHECK lines expect two separate instructions (sqrdmulh, then sqadd).
53 define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
54 ; CHECK-LABEL: test_sqrdmlah_v8i16:
56 ; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.8h
57 ; CHECK-NEXT: sqadd v0.8h, v0.8h, v1.8h
59 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
60 %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; <2 x i32> sqrdmulh whose result is the second operand of sqadd with %acc.
; The CHECK lines expect two separate instructions (sqrdmulh, then sqadd).
64 define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
65 ; CHECK-LABEL: test_sqrdmlah_v2i32:
67 ; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.2s
68 ; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s
70 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
71 %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; <4 x i32> sqrdmulh whose result is the second operand of sqadd with %acc.
; The CHECK lines expect two separate instructions (sqrdmulh, then sqadd).
75 define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
76 ; CHECK-LABEL: test_sqrdmlah_v4i32:
78 ; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.4s
79 ; CHECK-NEXT: sqadd v0.4s, v0.4s, v1.4s
81 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
82 %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; <4 x i16> sqrdmulh whose result is subtracted from %acc with sqsub.
; The CHECK lines expect two separate instructions (sqrdmulh, then sqsub).
86 define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
87 ; CHECK-LABEL: test_sqrdmlsh_v4i16:
89 ; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.4h
90 ; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
92 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
93 %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; <8 x i16> sqrdmulh whose result is subtracted from %acc with sqsub.
; The CHECK lines expect two separate instructions (sqrdmulh, then sqsub).
97 define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
98 ; CHECK-LABEL: test_sqrdmlsh_v8i16:
100 ; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.8h
101 ; CHECK-NEXT: sqsub v0.8h, v0.8h, v1.8h
103 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
104 %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
105 ret <8 x i16> %retval
; <2 x i32> sqrdmulh whose result is subtracted from %acc with sqsub.
; The CHECK lines expect two separate instructions (sqrdmulh, then sqsub).
108 define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
109 ; CHECK-LABEL: test_sqrdmlsh_v2i32:
111 ; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.2s
112 ; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s
114 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
115 %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
116 ret <2 x i32> %retval
; <4 x i32> sqrdmulh whose result is subtracted from %acc with sqsub.
; The CHECK lines expect two separate instructions (sqrdmulh, then sqsub).
119 define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
120 ; CHECK-LABEL: test_sqrdmlsh_v4i32:
122 ; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.4s
123 ; CHECK-NEXT: sqsub v0.4s, v0.4s, v1.4s
125 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
126 %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
127 ret <4 x i32> %retval
130 ;-----------------------------------------------------------------------------
131 ; RDMA Vector, by element
132 ; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
; Multiplies %x by a splat of lane 3 of %v (<4 x i16>), then sqadd with %acc.
; Expected: by-element sqrdmulh (v2.h[3]) followed by a separate sqadd.
134 define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
135 ; CHECK-LABEL: test_sqrdmlah_lane_s16:
136 ; CHECK: // %bb.0: // %entry
137 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
138 ; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.h[3]
139 ; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
142 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
143 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
144 %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
145 ret <4 x i16> %retval
; Multiplies %x by a splat of lane 2 of %v (<8 x i16>), then sqadd with %acc.
; Expected: by-element sqrdmulh (v2.h[2]) followed by a separate sqadd.
148 define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
149 ; CHECK-LABEL: test_sqrdmlahq_lane_s16:
150 ; CHECK: // %bb.0: // %entry
151 ; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.h[2]
152 ; CHECK-NEXT: sqadd v0.8h, v0.8h, v1.8h
155 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
156 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
157 %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
158 ret <8 x i16> %retval
; Multiplies %x by a splat of lane 1 of %v (<2 x i32>), then sqadd with %acc.
; Expected: by-element sqrdmulh (v2.s[1]) followed by a separate sqadd.
161 define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
162 ; CHECK-LABEL: test_sqrdmlah_lane_s32:
163 ; CHECK: // %bb.0: // %entry
164 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
165 ; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.s[1]
166 ; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s
169 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
170 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
171 %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
172 ret <2 x i32> %retval
; Multiplies %x by a splat of lane 0 of %v (zeroinitializer shuffle mask, <4 x i32>),
; then sqadd with %acc. Expected: by-element sqrdmulh (v2.s[0]) followed by sqadd.
175 define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
176 ; CHECK-LABEL: test_sqrdmlahq_lane_s32:
177 ; CHECK: // %bb.0: // %entry
178 ; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.s[0]
179 ; CHECK-NEXT: sqadd v0.4s, v0.4s, v1.4s
182 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
183 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
184 %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
185 ret <4 x i32> %retval
; Multiplies %x by a splat of lane 3 of %v (<4 x i16>), then sqsub from %acc.
; Expected: by-element sqrdmulh (v2.h[3]) followed by a separate sqsub.
188 define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
189 ; CHECK-LABEL: test_sqrdmlsh_lane_s16:
190 ; CHECK: // %bb.0: // %entry
191 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
192 ; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.h[3]
193 ; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
196 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
197 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
198 %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
199 ret <4 x i16> %retval
; Multiplies %x by a splat of lane 2 of %v (<8 x i16>), then sqsub from %acc.
; Expected: by-element sqrdmulh (v2.h[2]) followed by a separate sqsub.
202 define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
203 ; CHECK-LABEL: test_sqrdmlshq_lane_s16:
204 ; CHECK: // %bb.0: // %entry
205 ; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.h[2]
206 ; CHECK-NEXT: sqsub v0.8h, v0.8h, v1.8h
209 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
210 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
211 %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
212 ret <8 x i16> %retval
; Multiplies %x by a splat of lane 1 of %v (<2 x i32>), then sqsub from %acc.
; Expected: by-element sqrdmulh (v2.s[1]) followed by a separate sqsub.
215 define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
216 ; CHECK-LABEL: test_sqrdmlsh_lane_s32:
217 ; CHECK: // %bb.0: // %entry
218 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
219 ; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.s[1]
220 ; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s
223 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
224 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
225 %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
226 ret <2 x i32> %retval
; Multiplies %x by a splat of lane 0 of %v (zeroinitializer shuffle mask, <4 x i32>),
; then sqsub from %acc. Expected: by-element sqrdmulh (v2.s[0]) followed by sqsub.
229 define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
230 ; CHECK-LABEL: test_sqrdmlshq_lane_s32:
231 ; CHECK: // %bb.0: // %entry
232 ; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.s[0]
233 ; CHECK-NEXT: sqsub v0.4s, v0.4s, v1.4s
236 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
237 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
238 %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
239 ret <4 x i32> %retval
242 ;-----------------------------------------------------------------------------
243 ; RDMA Vector, by element, extracted
244 ; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
245 ; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
; Scalar i16 accumulator: %acc is inserted into lane 0 of an undef <4 x i16>, added
; (sqadd) to sqrdmulh(%x, splat of lane 1 of %v), and lane 0 of the sum is extracted.
; Expected: by-element sqrdmulh (v1.h[1]), fmov of %acc, separate vector sqadd, umov.
247 define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
248 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
249 ; CHECK: // %bb.0: // %entry
250 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
251 ; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[1]
252 ; CHECK-NEXT: fmov s1, w0
253 ; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h
254 ; CHECK-NEXT: umov w0, v0.h[0]
257 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
258 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
259 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
260 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
261 %retval = extractelement <4 x i16> %retval_vec, i64 0
; Scalar i16 accumulator: %acc is inserted into lane 0 of an undef <8 x i16>, added
; (sqadd) to sqrdmulh(%x, splat of lane 1 of %v), and lane 0 of the sum is extracted.
; Expected: by-element sqrdmulh (v1.h[1]), fmov of %acc, separate vector sqadd, umov.
265 define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
266 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
267 ; CHECK: // %bb.0: // %entry
268 ; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[1]
269 ; CHECK-NEXT: fmov s1, w0
270 ; CHECK-NEXT: sqadd v0.8h, v1.8h, v0.8h
271 ; CHECK-NEXT: umov w0, v0.h[0]
274 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
275 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
276 %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
277 %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
278 %retval = extractelement <8 x i16> %retval_vec, i64 0
; Lane 0 of sqrdmulh(%x, splat of lane 0 of %v) on <2 x i32> is extracted and passed
; to the scalar sqadd.i32 together with %acc.
; Expected: by-element vector sqrdmulh (v1.s[0]) followed by a separate scalar sqadd.
282 define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
283 ; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
284 ; CHECK: // %bb.0: // %entry
285 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
286 ; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[0]
287 ; CHECK-NEXT: fmov s1, w0
288 ; CHECK-NEXT: sqadd s0, s1, s0
289 ; CHECK-NEXT: fmov w0, s0
292 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
293 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
294 %extract = extractelement <2 x i32> %prod, i64 0
295 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; Lane 0 of sqrdmulh(%x, splat of lane 0 of %v) on <4 x i32> is extracted and passed
; to the scalar sqadd.i32 together with %acc.
; Expected: by-element vector sqrdmulh (v1.s[0]) followed by a separate scalar sqadd.
299 define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
300 ; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
301 ; CHECK: // %bb.0: // %entry
302 ; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[0]
303 ; CHECK-NEXT: fmov s1, w0
304 ; CHECK-NEXT: sqadd s0, s1, s0
305 ; CHECK-NEXT: fmov w0, s0
308 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
309 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
310 %extract = extractelement <4 x i32> %prod, i64 0
311 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; Scalar i16 accumulator: %acc is inserted into lane 0 of an undef <4 x i16>,
; sqrdmulh(%x, splat of lane 1 of %v) is subtracted from it (sqsub), lane 0 is extracted.
; Expected: by-element sqrdmulh (v1.h[1]), fmov of %acc, separate vector sqsub, umov.
315 define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
316 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
317 ; CHECK: // %bb.0: // %entry
318 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
319 ; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[1]
320 ; CHECK-NEXT: fmov s1, w0
321 ; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h
322 ; CHECK-NEXT: umov w0, v0.h[0]
325 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
326 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
327 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
328 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
329 %retval = extractelement <4 x i16> %retval_vec, i64 0
; Scalar i16 accumulator: %acc is inserted into lane 0 of an undef <8 x i16>,
; sqrdmulh(%x, splat of lane 1 of %v) is subtracted from it (sqsub), lane 0 is extracted.
; Expected: by-element sqrdmulh (v1.h[1]), fmov of %acc, separate vector sqsub, umov.
333 define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
334 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
335 ; CHECK: // %bb.0: // %entry
336 ; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[1]
337 ; CHECK-NEXT: fmov s1, w0
338 ; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h
339 ; CHECK-NEXT: umov w0, v0.h[0]
342 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
343 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
344 %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
345 %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
346 %retval = extractelement <8 x i16> %retval_vec, i64 0
; Lane 0 of sqrdmulh(%x, splat of lane 0 of %v) on <2 x i32> is extracted and
; subtracted from %acc with the scalar sqsub.i32.
; Expected: by-element vector sqrdmulh (v1.s[0]) followed by a separate scalar sqsub.
350 define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
351 ; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
352 ; CHECK: // %bb.0: // %entry
353 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
354 ; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[0]
355 ; CHECK-NEXT: fmov s1, w0
356 ; CHECK-NEXT: sqsub s0, s1, s0
357 ; CHECK-NEXT: fmov w0, s0
360 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
361 %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
362 %extract = extractelement <2 x i32> %prod, i64 0
363 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
; Lane 0 of sqrdmulh(%x, splat of lane 0 of %v) on <4 x i32> is extracted and
; subtracted from %acc with the scalar sqsub.i32.
; Expected: by-element vector sqrdmulh (v1.s[0]) followed by a separate scalar sqsub.
367 define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
368 ; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
369 ; CHECK: // %bb.0: // %entry
370 ; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[0]
371 ; CHECK-NEXT: fmov s1, w0
372 ; CHECK-NEXT: sqsub s0, s1, s0
373 ; CHECK-NEXT: fmov w0, s0
376 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
377 %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
378 %extract = extractelement <4 x i32> %prod, i64 0
379 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
383 ;-----------------------------------------------------------------------------
385 ; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
; Scalar i16 operands modelled as lane 0 of undef <4 x i16> vectors: sqrdmulh(%x, %y),
; then sqadd with %acc, then lane 0 is extracted.
; Expected: separate vector sqrdmulh and sqadd on .4h, with fmov/umov moves around them.
387 define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
388 ; CHECK-LABEL: test_sqrdmlah_v1i16:
390 ; CHECK-NEXT: fmov s0, w1
391 ; CHECK-NEXT: fmov s1, w2
392 ; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.4h
393 ; CHECK-NEXT: fmov s1, w0
394 ; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h
395 ; CHECK-NEXT: umov w0, v0.h[0]
397 %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
398 %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
399 %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
400 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
401 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
402 %retval = extractelement <4 x i16> %retval_vec, i64 0
; Scalar i32 operands modelled as lane 0 of undef <4 x i32> vectors: sqrdmulh(%x, %y),
; then sqadd with %acc, then lane 0 is extracted.
; Expected: separate vector sqrdmulh and sqadd on .4s, with fmov moves around them.
406 define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
407 ; CHECK-LABEL: test_sqrdmlah_v1i32:
409 ; CHECK-NEXT: fmov s0, w1
410 ; CHECK-NEXT: fmov s1, w2
411 ; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.4s
412 ; CHECK-NEXT: fmov s1, w0
413 ; CHECK-NEXT: sqadd v0.4s, v1.4s, v0.4s
414 ; CHECK-NEXT: fmov w0, s0
416 %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
417 %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
418 %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
419 %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
420 %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
421 %retval = extractelement <4 x i32> %retval_vec, i64 0
; Scalar i16 operands modelled as lane 0 of undef <4 x i16> vectors: sqrdmulh(%x, %y)
; is subtracted from %acc with sqsub, then lane 0 is extracted.
; Expected: separate vector sqrdmulh and sqsub on .4h, with fmov/umov moves around them.
426 define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
427 ; CHECK-LABEL: test_sqrdmlsh_v1i16:
429 ; CHECK-NEXT: fmov s0, w1
430 ; CHECK-NEXT: fmov s1, w2
431 ; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.4h
432 ; CHECK-NEXT: fmov s1, w0
433 ; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h
434 ; CHECK-NEXT: umov w0, v0.h[0]
436 %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
437 %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
438 %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
439 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
440 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
441 %retval = extractelement <4 x i16> %retval_vec, i64 0
; Scalar i32 operands modelled as lane 0 of undef <4 x i32> vectors: sqrdmulh(%x, %y)
; is subtracted from %acc with sqsub, then lane 0 is extracted.
; Expected: separate vector sqrdmulh and sqsub on .4s, with fmov moves around them.
445 define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
446 ; CHECK-LABEL: test_sqrdmlsh_v1i32:
448 ; CHECK-NEXT: fmov s0, w1
449 ; CHECK-NEXT: fmov s1, w2
450 ; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.4s
451 ; CHECK-NEXT: fmov s1, w0
452 ; CHECK-NEXT: sqsub v0.4s, v1.4s, v0.4s
453 ; CHECK-NEXT: fmov w0, s0
455 %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
456 %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
457 %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
458 %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
459 %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
460 %retval = extractelement <4 x i32> %retval_vec, i64 0
; Scalar i32 intrinsics: sqrdmulh.i32(%mhs, %rhs) fed into sqadd.i32 with %acc.
; Expected: separate scalar sqrdmulh and sqadd on s-registers.
464 define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
465 ; CHECK-LABEL: test_sqrdmlah_i32:
467 ; CHECK-NEXT: fmov s0, w1
468 ; CHECK-NEXT: fmov s1, w2
469 ; CHECK-NEXT: sqrdmulh s0, s0, s1
470 ; CHECK-NEXT: fmov s1, w0
471 ; CHECK-NEXT: sqadd s0, s1, s0
472 ; CHECK-NEXT: fmov w0, s0
474 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
475 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; Scalar i32 intrinsics: sqrdmulh.i32(%mhs, %rhs) subtracted from %acc with sqsub.i32.
; Expected: separate scalar sqrdmulh and sqsub on s-registers.
479 define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
480 ; CHECK-LABEL: test_sqrdmlsh_i32:
482 ; CHECK-NEXT: fmov s0, w1
483 ; CHECK-NEXT: fmov s1, w2
484 ; CHECK-NEXT: sqrdmulh s0, s0, s1
485 ; CHECK-NEXT: fmov s1, w0
486 ; CHECK-NEXT: sqsub s0, s1, s0
487 ; CHECK-NEXT: fmov w0, s0
489 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
490 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
494 ;-----------------------------------------------------------------------------
495 ; RDMA Scalar, by element
496 ; i16 tests are performed via the tests in the section above, with IR in ACLE style
497 ; i32 tests are for v1i32_indexed in SIMDIndexedSQRDMLxHSDTied
; Scalar %x (lane 0 of an undef <4 x i16>) times a splat of lane 1 of %y_vec via
; sqrdmulh, then sqadd with %acc (also in lane 0); lane 0 of the sum is extracted.
; Expected: by-element sqrdmulh (v0.h[1]) followed by a separate vector sqadd.
499 define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
500 ; CHECK-LABEL: test_sqrdmlah_extract_i16:
502 ; CHECK-NEXT: fmov s1, w1
503 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
504 ; CHECK-NEXT: sqrdmulh v0.4h, v1.4h, v0.h[1]
505 ; CHECK-NEXT: fmov s1, w0
506 ; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h
507 ; CHECK-NEXT: umov w0, v0.h[0]
509 %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
510 %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
511 %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
512 %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
513 %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
514 %retval = extractelement <4 x i16> %retval_vec, i32 0
; Scalar sqrdmulh.i32 of %mhs with lane 3 extracted from %rhs, then sqadd.i32 with %acc.
; Expected: scalar by-element sqrdmulh (v0.s[3]) followed by a separate scalar sqadd.
518 define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
519 ; CHECK-LABEL: test_sqrdmlah_extract_i32:
521 ; CHECK-NEXT: fmov s1, w1
522 ; CHECK-NEXT: sqrdmulh s0, s1, v0.s[3]
523 ; CHECK-NEXT: fmov s1, w0
524 ; CHECK-NEXT: sqadd s0, s1, s0
525 ; CHECK-NEXT: fmov w0, s0
527 %extract = extractelement <4 x i32> %rhs, i32 3
528 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
529 %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; Scalar %x (lane 0 of an undef <8 x i16>) times a splat of lane 1 of %y_vec via
; sqrdmulh, subtracted from %acc (also in lane 0) with sqsub; lane 0 is extracted.
; Expected: by-element sqrdmulh (v0.h[1]) followed by a separate vector sqsub.
533 define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
534 ; CHECK-LABEL: test_sqrdmlshq_extract_i16:
536 ; CHECK-NEXT: fmov s1, w1
537 ; CHECK-NEXT: sqrdmulh v0.8h, v1.8h, v0.h[1]
538 ; CHECK-NEXT: fmov s1, w0
539 ; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h
540 ; CHECK-NEXT: umov w0, v0.h[0]
542 %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
543 %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
544 %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
545 %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
546 %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
547 %retval = extractelement <8 x i16> %retval_vec, i32 0
; Scalar sqrdmulh.i32 of %mhs with lane 3 extracted from %rhs, subtracted from %acc
; with sqsub.i32.
; Expected: scalar by-element sqrdmulh (v0.s[3]) followed by a separate scalar sqsub.
551 define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
552 ; CHECK-LABEL: test_sqrdmlsh_extract_i32:
554 ; CHECK-NEXT: fmov s1, w1
555 ; CHECK-NEXT: sqrdmulh s0, s1, v0.s[3]
556 ; CHECK-NEXT: fmov s1, w0
557 ; CHECK-NEXT: sqsub s0, s1, s0
558 ; CHECK-NEXT: fmov w0, s0
560 %extract = extractelement <4 x i32> %rhs, i32 3
561 %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
562 %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
567 ;-----------------------------------------------------------------------------
568 ; Using sqrdmlah/sqrdmlsh intrinsics
; Direct sqrdmlah.v4i16 call whose third operand is a splat of lane 7 of <8 x i16> %v.
; Expected: a single by-element sqrdmlah (v2.h[7]).
570 define <4 x i16> @test_vqrdmlah_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
571 ; CHECK-LABEL: test_vqrdmlah_laneq_s16:
572 ; CHECK: // %bb.0: // %entry
573 ; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[7]
576 %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
577 %vqrdmlah_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
578 ret <4 x i16> %vqrdmlah_v3.i
; Direct sqrdmlah.v2i32 call whose third operand is a splat of lane 3 of <4 x i32> %v.
; Expected: a single by-element sqrdmlah (v2.s[3]).
581 define <2 x i32> @test_vqrdmlah_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
582 ; CHECK-LABEL: test_vqrdmlah_laneq_s32:
583 ; CHECK: // %bb.0: // %entry
584 ; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[3]
587 %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
588 %vqrdmlah_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
589 ret <2 x i32> %vqrdmlah_v3.i
; Direct sqrdmlah.v8i16 call whose third operand is a splat of lane 7 of %v.
; Expected: a single by-element sqrdmlah (v2.h[7]).
592 define <8 x i16> @test_vqrdmlahq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
593 ; CHECK-LABEL: test_vqrdmlahq_laneq_s16:
594 ; CHECK: // %bb.0: // %entry
595 ; CHECK-NEXT: sqrdmlah v0.8h, v1.8h, v2.h[7]
598 %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
599 %vqrdmlahq_v3.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #4
600 ret <8 x i16> %vqrdmlahq_v3.i
; Direct sqrdmlah.v4i32 call whose third operand is a splat of lane 3 of %v.
; Expected: a single by-element sqrdmlah (v2.s[3]).
603 define <4 x i32> @test_vqrdmlahq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
604 ; CHECK-LABEL: test_vqrdmlahq_laneq_s32:
605 ; CHECK: // %bb.0: // %entry
606 ; CHECK-NEXT: sqrdmlah v0.4s, v1.4s, v2.s[3]
609 %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
610 %vqrdmlahq_v3.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #4
611 ret <4 x i32> %vqrdmlahq_v3.i
; Scalar i16 sqrdmlah: %a, %b, %c are each inserted into lane 0 of an undef <4 x i16>,
; passed to sqrdmlah.v4i16, and lane 0 of the result is extracted.
; Expected: three fmovs, one vector sqrdmlah on .4h, then umov of lane 0.
614 define i16 @test_vqrdmlahh_s16(i16 %a, i16 %b, i16 %c) {
615 ; CHECK-LABEL: test_vqrdmlahh_s16:
616 ; CHECK: // %bb.0: // %entry
617 ; CHECK-NEXT: fmov s0, w0
618 ; CHECK-NEXT: fmov s1, w1
619 ; CHECK-NEXT: fmov s2, w2
620 ; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.4h
621 ; CHECK-NEXT: umov w0, v0.h[0]
624 %0 = insertelement <4 x i16> undef, i16 %a, i64 0
625 %1 = insertelement <4 x i16> undef, i16 %b, i64 0
626 %2 = insertelement <4 x i16> undef, i16 %c, i64 0
627 %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
628 %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0
; Direct call to the scalar sqrdmlah.i32 intrinsic on three i32 arguments.
; Expected: three fmovs into s-registers, one scalar sqrdmlah, fmov of the result.
632 define i32 @test_vqrdmlahs_s32(i32 %a, i32 %b, i32 %c) {
633 ; CHECK-LABEL: test_vqrdmlahs_s32:
634 ; CHECK: // %bb.0: // %entry
635 ; CHECK-NEXT: fmov s0, w0
636 ; CHECK-NEXT: fmov s1, w1
637 ; CHECK-NEXT: fmov s2, w2
638 ; CHECK-NEXT: sqrdmlah s0, s1, s2
639 ; CHECK-NEXT: fmov w0, s0
642 %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %c) #4
643 ret i32 %vqrdmlahs_s32.i
; Scalar i16 sqrdmlah by lane: %a and %b go into lane 0 of undef <4 x i16> vectors,
; lane 3 of %c is shuffled into lane 0 (remaining mask lanes undef), and lane 0 of the
; sqrdmlah.v4i16 result is extracted. Expected: a by-element sqrdmlah (v0.h[3]).
646 define i16 @test_vqrdmlahh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
647 ; CHECK-LABEL: test_vqrdmlahh_lane_s16:
648 ; CHECK: // %bb.0: // %entry
649 ; CHECK-NEXT: fmov s1, w0
650 ; CHECK-NEXT: fmov s2, w1
651 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
652 ; CHECK-NEXT: sqrdmlah v1.4h, v2.4h, v0.h[3]
653 ; CHECK-NEXT: umov w0, v1.h[0]
656 %0 = insertelement <4 x i16> undef, i16 %a, i64 0
657 %1 = insertelement <4 x i16> undef, i16 %b, i64 0
658 %2 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
659 %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
660 %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0
; Scalar sqrdmlah.i32 whose third operand is lane 1 extracted from <2 x i32> %c.
; Expected: a scalar by-element sqrdmlah (v0.s[1]).
664 define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
665 ; CHECK-LABEL: test_vqrdmlahs_lane_s32:
666 ; CHECK: // %bb.0: // %entry
667 ; CHECK-NEXT: fmov s1, w0
668 ; CHECK-NEXT: fmov s2, w1
669 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
670 ; CHECK-NEXT: sqrdmlah s1, s2, v0.s[1]
671 ; CHECK-NEXT: fmov w0, s1
674 %vget_lane = extractelement <2 x i32> %c, i64 1
675 %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4
676 ret i32 %vqrdmlahs_s32.i
; Scalar i16 sqrdmlah by lane of a <8 x i16>: %a and %b go into lane 0 of undef
; <4 x i16> vectors, lane 7 of %c is shuffled into lane 0 (remaining mask lanes undef),
; and lane 0 of the result is extracted. Expected: a by-element sqrdmlah (v0.h[7]).
679 define i16 @test_vqrdmlahh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
680 ; CHECK-LABEL: test_vqrdmlahh_laneq_s16:
681 ; CHECK: // %bb.0: // %entry
682 ; CHECK-NEXT: fmov s1, w0
683 ; CHECK-NEXT: fmov s2, w1
684 ; CHECK-NEXT: sqrdmlah v1.4h, v2.4h, v0.h[7]
685 ; CHECK-NEXT: umov w0, v1.h[0]
688 %0 = insertelement <4 x i16> undef, i16 %a, i64 0
689 %1 = insertelement <4 x i16> undef, i16 %b, i64 0
690 %2 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 7, i32 undef, i32 undef, i32 undef>
691 %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
692 %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0
; Scalar sqrdmlah.i32 whose third operand is lane 3 extracted from <4 x i32> %c.
; Expected: a scalar by-element sqrdmlah (v0.s[3]).
696 define i32 @test_vqrdmlahs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) {
697 ; CHECK-LABEL: test_vqrdmlahs_laneq_s32:
698 ; CHECK: // %bb.0: // %entry
699 ; CHECK-NEXT: fmov s1, w0
700 ; CHECK-NEXT: fmov s2, w1
701 ; CHECK-NEXT: sqrdmlah s1, s2, v0.s[3]
702 ; CHECK-NEXT: fmov w0, s1
705 %vgetq_lane = extractelement <4 x i32> %c, i64 3
706 %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
707 ret i32 %vqrdmlahs_s32.i
; Direct sqrdmlsh.v4i16 call whose third operand is a splat of lane 7 of <8 x i16> %v.
; Expected: a single by-element sqrdmlsh (v2.h[7]).
710 define <4 x i16> @test_vqrdmlsh_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
711 ; CHECK-LABEL: test_vqrdmlsh_laneq_s16:
712 ; CHECK: // %bb.0: // %entry
713 ; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[7]
716 %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
717 %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
718 ret <4 x i16> %vqrdmlsh_v3.i
; Direct sqrdmlsh.v2i32 call whose third operand is a splat of lane 3 of <4 x i32> %v.
; Expected: a single by-element sqrdmlsh (v2.s[3]).
721 define <2 x i32> @test_vqrdmlsh_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
722 ; CHECK-LABEL: test_vqrdmlsh_laneq_s32:
723 ; CHECK: // %bb.0: // %entry
724 ; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[3]
727 %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
728 %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
729 ret <2 x i32> %vqrdmlsh_v3.i
; Direct sqrdmlsh.v8i16 call whose third operand is a splat of lane 7 of %v.
; Expected: a single by-element sqrdmlsh (v2.h[7]).
732 define <8 x i16> @test_vqrdmlshq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
733 ; CHECK-LABEL: test_vqrdmlshq_laneq_s16:
734 ; CHECK: // %bb.0: // %entry
735 ; CHECK-NEXT: sqrdmlsh v0.8h, v1.8h, v2.h[7]
738 %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
739 %vqrdmlshq_v3.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #4
740 ret <8 x i16> %vqrdmlshq_v3.i
; Direct sqrdmlsh.v4i32 call whose third operand is a splat of lane 3 of %v.
; Expected: a single by-element sqrdmlsh (v2.s[3]).
743 define <4 x i32> @test_vqrdmlshq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
744 ; CHECK-LABEL: test_vqrdmlshq_laneq_s32:
745 ; CHECK: // %bb.0: // %entry
746 ; CHECK-NEXT: sqrdmlsh v0.4s, v1.4s, v2.s[3]
749 %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
750 %vqrdmlshq_v3.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #4
751 ret <4 x i32> %vqrdmlshq_v3.i
; Scalar i16 sqrdmlsh: %a, %b, %c are each inserted into lane 0 of an undef <4 x i16>,
; passed to sqrdmlsh.v4i16, and lane 0 of the result is extracted.
; Expected: three fmovs, one vector sqrdmlsh on .4h, then umov of lane 0.
754 define i16 @test_vqrdmlshh_s16(i16 %a, i16 %b, i16 %c) {
755 ; CHECK-LABEL: test_vqrdmlshh_s16:
756 ; CHECK: // %bb.0: // %entry
757 ; CHECK-NEXT: fmov s0, w0
758 ; CHECK-NEXT: fmov s1, w1
759 ; CHECK-NEXT: fmov s2, w2
760 ; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.4h
761 ; CHECK-NEXT: umov w0, v0.h[0]
764 %0 = insertelement <4 x i16> undef, i16 %a, i64 0
765 %1 = insertelement <4 x i16> undef, i16 %b, i64 0
766 %2 = insertelement <4 x i16> undef, i16 %c, i64 0
767 %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
768 %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0
; Direct call to the scalar sqrdmlsh.i32 intrinsic on three i32 arguments.
; Expected: three fmovs into s-registers, one scalar sqrdmlsh, fmov of the result.
772 define i32 @test_vqrdmlshs_s32(i32 %a, i32 %b, i32 %c) {
773 ; CHECK-LABEL: test_vqrdmlshs_s32:
774 ; CHECK: // %bb.0: // %entry
775 ; CHECK-NEXT: fmov s0, w0
776 ; CHECK-NEXT: fmov s1, w1
777 ; CHECK-NEXT: fmov s2, w2
778 ; CHECK-NEXT: sqrdmlsh s0, s1, s2
779 ; CHECK-NEXT: fmov w0, s0
782 %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %c) #4
783 ret i32 %vqrdmlshs_s32.i
; Scalar i16 sqrdmlsh by lane: %a and %b go into lane 0 of undef <4 x i16> vectors,
; lane 3 of %c is shuffled into lane 0 (remaining mask lanes undef), and lane 0 of the
; sqrdmlsh.v4i16 result is extracted. Expected: a by-element sqrdmlsh (v0.h[3]).
786 define i16 @test_vqrdmlshh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
787 ; CHECK-LABEL: test_vqrdmlshh_lane_s16:
788 ; CHECK: // %bb.0: // %entry
789 ; CHECK-NEXT: fmov s1, w0
790 ; CHECK-NEXT: fmov s2, w1
791 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
792 ; CHECK-NEXT: sqrdmlsh v1.4h, v2.4h, v0.h[3]
793 ; CHECK-NEXT: umov w0, v1.h[0]
796 %0 = insertelement <4 x i16> undef, i16 %a, i64 0
797 %1 = insertelement <4 x i16> undef, i16 %b, i64 0
798 %2 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
799 %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
800 %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0
; Scalar sqrdmlsh.i32 whose third operand is lane 1 extracted from <2 x i32> %c.
; Expected: a scalar by-element sqrdmlsh (v0.s[1]).
804 define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
805 ; CHECK-LABEL: test_vqrdmlshs_lane_s32:
806 ; CHECK: // %bb.0: // %entry
807 ; CHECK-NEXT: fmov s1, w0
808 ; CHECK-NEXT: fmov s2, w1
809 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
810 ; CHECK-NEXT: sqrdmlsh s1, s2, v0.s[1]
811 ; CHECK-NEXT: fmov w0, s1
814 %vget_lane = extractelement <2 x i32> %c, i64 1
815 %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4
816 ret i32 %vqrdmlshs_s32.i
; Scalar i16 sqrdmlsh by lane of a <8 x i16>: %a and %b go into lane 0 of undef
; <4 x i16> vectors, lane 7 of %c is shuffled into lane 0 (remaining mask lanes undef),
; and lane 0 of the result is extracted. Expected: a by-element sqrdmlsh (v0.h[7]).
819 define i16 @test_vqrdmlshh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
820 ; CHECK-LABEL: test_vqrdmlshh_laneq_s16:
821 ; CHECK: // %bb.0: // %entry
822 ; CHECK-NEXT: fmov s1, w0
823 ; CHECK-NEXT: fmov s2, w1
824 ; CHECK-NEXT: sqrdmlsh v1.4h, v2.4h, v0.h[7]
825 ; CHECK-NEXT: umov w0, v1.h[0]
828 %0 = insertelement <4 x i16> undef, i16 %a, i64 0
829 %1 = insertelement <4 x i16> undef, i16 %b, i64 0
830 %2 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 7, i32 undef, i32 undef, i32 undef>
831 %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
832 %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0
; Scalar sqrdmlsh.i32 whose third operand is lane 3 extracted from <4 x i32> %c.
; Expected: a scalar by-element sqrdmlsh (v0.s[3]).
836 define i32 @test_vqrdmlshs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) {
837 ; CHECK-LABEL: test_vqrdmlshs_laneq_s32:
838 ; CHECK: // %bb.0: // %entry
839 ; CHECK-NEXT: fmov s1, w0
840 ; CHECK-NEXT: fmov s2, w1
841 ; CHECK-NEXT: sqrdmlsh s1, s2, v0.s[3]
842 ; CHECK-NEXT: fmov w0, s1
845 %vgetq_lane = extractelement <4 x i32> %c, i64 3
846 %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
847 ret i32 %vqrdmlshs_s32.i