; RUN: llc < %s -asm-verbose=false -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
4 define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
7 %tmp1 = load <8 x i8>, <8 x i8>* %A
8 %tmp2 = load <8 x i8>, <8 x i8>* %B
9 %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
13 define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
14 ;CHECK-LABEL: smull4s:
16 %tmp1 = load <4 x i16>, <4 x i16>* %A
17 %tmp2 = load <4 x i16>, <4 x i16>* %B
18 %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
22 define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
23 ;CHECK-LABEL: smull2d:
25 %tmp1 = load <2 x i32>, <2 x i32>* %A
26 %tmp2 = load <2 x i32>, <2 x i32>* %B
27 %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
; Signed widening multiply (smull) intrinsic declarations used by the tests above.
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
35 define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
36 ;CHECK-LABEL: umull8h:
38 %tmp1 = load <8 x i8>, <8 x i8>* %A
39 %tmp2 = load <8 x i8>, <8 x i8>* %B
40 %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
44 define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
45 ;CHECK-LABEL: umull4s:
47 %tmp1 = load <4 x i16>, <4 x i16>* %A
48 %tmp2 = load <4 x i16>, <4 x i16>* %B
49 %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
53 define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
54 ;CHECK-LABEL: umull2d:
56 %tmp1 = load <2 x i32>, <2 x i32>* %A
57 %tmp2 = load <2 x i32>, <2 x i32>* %B
58 %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
; Unsigned widening multiply (umull) intrinsic declarations used by the tests above.
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
66 define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
67 ;CHECK-LABEL: sqdmull4s:
69 %tmp1 = load <4 x i16>, <4 x i16>* %A
70 %tmp2 = load <4 x i16>, <4 x i16>* %B
71 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
75 define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
76 ;CHECK-LABEL: sqdmull2d:
78 %tmp1 = load <2 x i32>, <2 x i32>* %A
79 %tmp2 = load <2 x i32>, <2 x i32>* %B
80 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
84 define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
85 ;CHECK-LABEL: sqdmull2_4s:
87 %load1 = load <8 x i16>, <8 x i16>* %A
88 %load2 = load <8 x i16>, <8 x i16>* %B
89 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
90 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
91 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
95 define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
96 ;CHECK-LABEL: sqdmull2_2d:
98 %load1 = load <4 x i32>, <4 x i32>* %A
99 %load2 = load <4 x i32>, <4 x i32>* %B
100 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
101 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
102 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
; Saturating doubling widening multiply (sqdmull) intrinsic declarations.
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
110 define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
111 ;CHECK-LABEL: pmull8h:
113 %tmp1 = load <8 x i8>, <8 x i8>* %A
114 %tmp2 = load <8 x i8>, <8 x i8>* %B
115 %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
; Polynomial multiply (pmull) intrinsic declaration.
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
121 define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
122 ;CHECK-LABEL: sqdmulh_4h:
124 %tmp1 = load <4 x i16>, <4 x i16>* %A
125 %tmp2 = load <4 x i16>, <4 x i16>* %B
126 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
130 define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
131 ;CHECK-LABEL: sqdmulh_8h:
133 %tmp1 = load <8 x i16>, <8 x i16>* %A
134 %tmp2 = load <8 x i16>, <8 x i16>* %B
135 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
139 define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
140 ;CHECK-LABEL: sqdmulh_2s:
142 %tmp1 = load <2 x i32>, <2 x i32>* %A
143 %tmp2 = load <2 x i32>, <2 x i32>* %B
144 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
148 define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
149 ;CHECK-LABEL: sqdmulh_4s:
151 %tmp1 = load <4 x i32>, <4 x i32>* %A
152 %tmp2 = load <4 x i32>, <4 x i32>* %B
153 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
157 define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind {
158 ;CHECK-LABEL: sqdmulh_1s:
159 ;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
160 %tmp1 = load i32, i32* %A
161 %tmp2 = load i32, i32* %B
162 %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
; Saturating doubling multiply-high (sqdmulh) intrinsic declarations,
; including the scalar i32 form used by the sqdmulh_1s test.
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
172 define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
173 ;CHECK-LABEL: sqrdmulh_4h:
175 %tmp1 = load <4 x i16>, <4 x i16>* %A
176 %tmp2 = load <4 x i16>, <4 x i16>* %B
177 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
181 define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
182 ;CHECK-LABEL: sqrdmulh_8h:
184 %tmp1 = load <8 x i16>, <8 x i16>* %A
185 %tmp2 = load <8 x i16>, <8 x i16>* %B
186 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
190 define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
191 ;CHECK-LABEL: sqrdmulh_2s:
193 %tmp1 = load <2 x i32>, <2 x i32>* %A
194 %tmp2 = load <2 x i32>, <2 x i32>* %B
195 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
199 define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
200 ;CHECK-LABEL: sqrdmulh_4s:
202 %tmp1 = load <4 x i32>, <4 x i32>* %A
203 %tmp2 = load <4 x i32>, <4 x i32>* %B
204 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
208 define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind {
209 ;CHECK-LABEL: sqrdmulh_1s:
210 ;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
211 %tmp1 = load i32, i32* %A
212 %tmp2 = load i32, i32* %B
213 %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
; Saturating rounding doubling multiply-high (sqrdmulh) intrinsic declarations,
; including the scalar i32 form used by the sqrdmulh_1s test.
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
223 define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
224 ;CHECK-LABEL: fmulx_2s:
226 %tmp1 = load <2 x float>, <2 x float>* %A
227 %tmp2 = load <2 x float>, <2 x float>* %B
228 %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
229 ret <2 x float> %tmp3
232 define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
233 ;CHECK-LABEL: fmulx_4s:
235 %tmp1 = load <4 x float>, <4 x float>* %A
236 %tmp2 = load <4 x float>, <4 x float>* %B
237 %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
238 ret <4 x float> %tmp3
241 define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
242 ;CHECK-LABEL: fmulx_2d:
244 %tmp1 = load <2 x double>, <2 x double>* %A
245 %tmp2 = load <2 x double>, <2 x double>* %B
246 %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
247 ret <2 x double> %tmp3
; Floating-point multiply extended (fmulx) intrinsic declarations.
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
254 define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
255 ;CHECK-LABEL: smlal4s:
257 %tmp1 = load <4 x i16>, <4 x i16>* %A
258 %tmp2 = load <4 x i16>, <4 x i16>* %B
259 %tmp3 = load <4 x i32>, <4 x i32>* %C
260 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
261 %tmp5 = add <4 x i32> %tmp3, %tmp4
265 define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
266 ;CHECK-LABEL: smlal2d:
268 %tmp1 = load <2 x i32>, <2 x i32>* %A
269 %tmp2 = load <2 x i32>, <2 x i32>* %B
270 %tmp3 = load <2 x i64>, <2 x i64>* %C
271 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
272 %tmp5 = add <2 x i64> %tmp3, %tmp4
276 define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
277 ;CHECK-LABEL: smlsl4s:
279 %tmp1 = load <4 x i16>, <4 x i16>* %A
280 %tmp2 = load <4 x i16>, <4 x i16>* %B
281 %tmp3 = load <4 x i32>, <4 x i32>* %C
282 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
283 %tmp5 = sub <4 x i32> %tmp3, %tmp4
287 define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
288 ;CHECK-LABEL: smlsl2d:
290 %tmp1 = load <2 x i32>, <2 x i32>* %A
291 %tmp2 = load <2 x i32>, <2 x i32>* %B
292 %tmp3 = load <2 x i64>, <2 x i64>* %C
293 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
294 %tmp5 = sub <2 x i64> %tmp3, %tmp4
; Saturating add/sub intrinsic declarations, used to form the sqdmlal/sqdmlsl
; (multiply-accumulate/-subtract) test patterns below.
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
303 define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
304 ;CHECK-LABEL: sqdmlal4s:
306 %tmp1 = load <4 x i16>, <4 x i16>* %A
307 %tmp2 = load <4 x i16>, <4 x i16>* %B
308 %tmp3 = load <4 x i32>, <4 x i32>* %C
309 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
310 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
314 define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
315 ;CHECK-LABEL: sqdmlal2d:
317 %tmp1 = load <2 x i32>, <2 x i32>* %A
318 %tmp2 = load <2 x i32>, <2 x i32>* %B
319 %tmp3 = load <2 x i64>, <2 x i64>* %C
320 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
321 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
325 define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
326 ;CHECK-LABEL: sqdmlal2_4s:
328 %load1 = load <8 x i16>, <8 x i16>* %A
329 %load2 = load <8 x i16>, <8 x i16>* %B
330 %tmp3 = load <4 x i32>, <4 x i32>* %C
331 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
332 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
333 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
334 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
338 define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
339 ;CHECK-LABEL: sqdmlal2_2d:
341 %load1 = load <4 x i32>, <4 x i32>* %A
342 %load2 = load <4 x i32>, <4 x i32>* %B
343 %tmp3 = load <2 x i64>, <2 x i64>* %C
344 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
345 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
346 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
347 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
351 define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
352 ;CHECK-LABEL: sqdmlsl4s:
354 %tmp1 = load <4 x i16>, <4 x i16>* %A
355 %tmp2 = load <4 x i16>, <4 x i16>* %B
356 %tmp3 = load <4 x i32>, <4 x i32>* %C
357 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
358 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
362 define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
363 ;CHECK-LABEL: sqdmlsl2d:
365 %tmp1 = load <2 x i32>, <2 x i32>* %A
366 %tmp2 = load <2 x i32>, <2 x i32>* %B
367 %tmp3 = load <2 x i64>, <2 x i64>* %C
368 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
369 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
373 define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
374 ;CHECK-LABEL: sqdmlsl2_4s:
376 %load1 = load <8 x i16>, <8 x i16>* %A
377 %load2 = load <8 x i16>, <8 x i16>* %B
378 %tmp3 = load <4 x i32>, <4 x i32>* %C
379 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
380 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
381 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
382 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
386 define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
387 ;CHECK-LABEL: sqdmlsl2_2d:
389 %load1 = load <4 x i32>, <4 x i32>* %A
390 %load2 = load <4 x i32>, <4 x i32>* %B
391 %tmp3 = load <2 x i64>, <2 x i64>* %C
392 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
393 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
394 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
395 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
399 define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
400 ;CHECK-LABEL: umlal4s:
402 %tmp1 = load <4 x i16>, <4 x i16>* %A
403 %tmp2 = load <4 x i16>, <4 x i16>* %B
404 %tmp3 = load <4 x i32>, <4 x i32>* %C
405 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
406 %tmp5 = add <4 x i32> %tmp3, %tmp4
410 define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
411 ;CHECK-LABEL: umlal2d:
413 %tmp1 = load <2 x i32>, <2 x i32>* %A
414 %tmp2 = load <2 x i32>, <2 x i32>* %B
415 %tmp3 = load <2 x i64>, <2 x i64>* %C
416 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
417 %tmp5 = add <2 x i64> %tmp3, %tmp4
421 define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
422 ;CHECK-LABEL: umlsl4s:
424 %tmp1 = load <4 x i16>, <4 x i16>* %A
425 %tmp2 = load <4 x i16>, <4 x i16>* %B
426 %tmp3 = load <4 x i32>, <4 x i32>* %C
427 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
428 %tmp5 = sub <4 x i32> %tmp3, %tmp4
432 define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
433 ;CHECK-LABEL: umlsl2d:
435 %tmp1 = load <2 x i32>, <2 x i32>* %A
436 %tmp2 = load <2 x i32>, <2 x i32>* %B
437 %tmp3 = load <2 x i64>, <2 x i64>* %C
438 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
439 %tmp5 = sub <2 x i64> %tmp3, %tmp4
443 define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
444 ;CHECK-LABEL: fmla_2s:
446 %tmp1 = load <2 x float>, <2 x float>* %A
447 %tmp2 = load <2 x float>, <2 x float>* %B
448 %tmp3 = load <2 x float>, <2 x float>* %C
449 %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
450 ret <2 x float> %tmp4
453 define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
454 ;CHECK-LABEL: fmla_4s:
456 %tmp1 = load <4 x float>, <4 x float>* %A
457 %tmp2 = load <4 x float>, <4 x float>* %B
458 %tmp3 = load <4 x float>, <4 x float>* %C
459 %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
460 ret <4 x float> %tmp4
463 define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
464 ;CHECK-LABEL: fmla_2d:
466 %tmp1 = load <2 x double>, <2 x double>* %A
467 %tmp2 = load <2 x double>, <2 x double>* %B
468 %tmp3 = load <2 x double>, <2 x double>* %C
469 %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
470 ret <2 x double> %tmp4
; Generic fused multiply-add intrinsic declarations, used by the fmla/fmls tests.
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
477 define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
478 ;CHECK-LABEL: fmls_2s:
480 %tmp1 = load <2 x float>, <2 x float>* %A
481 %tmp2 = load <2 x float>, <2 x float>* %B
482 %tmp3 = load <2 x float>, <2 x float>* %C
483 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
484 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3)
485 ret <2 x float> %tmp5
488 define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
489 ;CHECK-LABEL: fmls_4s:
491 %tmp1 = load <4 x float>, <4 x float>* %A
492 %tmp2 = load <4 x float>, <4 x float>* %B
493 %tmp3 = load <4 x float>, <4 x float>* %C
494 %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
495 %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3)
496 ret <4 x float> %tmp5
499 define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
500 ;CHECK-LABEL: fmls_2d:
502 %tmp1 = load <2 x double>, <2 x double>* %A
503 %tmp2 = load <2 x double>, <2 x double>* %B
504 %tmp3 = load <2 x double>, <2 x double>* %C
505 %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
506 %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3)
507 ret <2 x double> %tmp5
510 define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
511 ;CHECK-LABEL: fmls_commuted_neg_2s:
513 %tmp1 = load <2 x float>, <2 x float>* %A
514 %tmp2 = load <2 x float>, <2 x float>* %B
515 %tmp3 = load <2 x float>, <2 x float>* %C
516 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
517 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3)
518 ret <2 x float> %tmp5
521 define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
522 ;CHECK-LABEL: fmls_commuted_neg_4s:
524 %tmp1 = load <4 x float>, <4 x float>* %A
525 %tmp2 = load <4 x float>, <4 x float>* %B
526 %tmp3 = load <4 x float>, <4 x float>* %C
527 %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
528 %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3)
529 ret <4 x float> %tmp5
532 define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
533 ;CHECK-LABEL: fmls_commuted_neg_2d:
535 %tmp1 = load <2 x double>, <2 x double>* %A
536 %tmp2 = load <2 x double>, <2 x double>* %B
537 %tmp3 = load <2 x double>, <2 x double>* %C
538 %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
539 %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3)
540 ret <2 x double> %tmp5
543 define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
544 ;CHECK-LABEL: fmls_indexed_2s:
547 %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
548 %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
549 %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
550 ret <2 x float> %fmls1
553 define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
554 ;CHECK-LABEL: fmls_indexed_4s:
557 %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
558 %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
559 %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
560 ret <4 x float> %fmls1
563 define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
564 ;CHECK-LABEL: fmls_indexed_2d:
567 %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
568 %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
569 %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
570 ret <2 x double> %fmls1
573 define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
575 ; CHECK-LABEL: fmla_indexed_scalar_2s:
576 ; CHECK-NEXT: fmla.2s
578 %v1 = insertelement <2 x float> undef, float %c, i32 0
579 %v2 = insertelement <2 x float> %v1, float %c, i32 1
580 %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v1, <2 x float> %b, <2 x float> %a) nounwind
581 ret <2 x float> %fmla1
584 define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
586 ; CHECK-LABEL: fmla_indexed_scalar_4s:
587 ; CHECK-NEXT: fmla.4s
589 %v1 = insertelement <4 x float> undef, float %c, i32 0
590 %v2 = insertelement <4 x float> %v1, float %c, i32 1
591 %v3 = insertelement <4 x float> %v2, float %c, i32 2
592 %v4 = insertelement <4 x float> %v3, float %c, i32 3
593 %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
594 ret <4 x float> %fmla1
597 define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
598 ; CHECK-LABEL: fmla_indexed_scalar_2d:
599 ; CHECK-NEXT: fmla.2d
602 %v1 = insertelement <2 x double> undef, double %c, i32 0
603 %v2 = insertelement <2 x double> %v1, double %c, i32 1
604 %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
605 ret <2 x double> %fmla1
608 define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
609 ;CHECK-LABEL: mul_4h:
612 %tmp1 = load <4 x i16>, <4 x i16>* %A
613 %tmp2 = load <4 x i16>, <4 x i16>* %B
614 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
615 %tmp4 = mul <4 x i16> %tmp1, %tmp3
619 define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
620 ;CHECK-LABEL: mul_8h:
623 %tmp1 = load <8 x i16>, <8 x i16>* %A
624 %tmp2 = load <8 x i16>, <8 x i16>* %B
625 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
626 %tmp4 = mul <8 x i16> %tmp1, %tmp3
630 define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
631 ;CHECK-LABEL: mul_2s:
634 %tmp1 = load <2 x i32>, <2 x i32>* %A
635 %tmp2 = load <2 x i32>, <2 x i32>* %B
636 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
637 %tmp4 = mul <2 x i32> %tmp1, %tmp3
641 define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
642 ;CHECK-LABEL: mul_4s:
645 %tmp1 = load <4 x i32>, <4 x i32>* %A
646 %tmp2 = load <4 x i32>, <4 x i32>* %B
647 %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
648 %tmp4 = mul <4 x i32> %tmp1, %tmp3
652 define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
653 ; CHECK-LABEL: mul_2d:
656 %tmp1 = mul <2 x i64> %A, %B
660 define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
661 ;CHECK-LABEL: fmul_lane_2s:
664 %tmp1 = load <2 x float>, <2 x float>* %A
665 %tmp2 = load <2 x float>, <2 x float>* %B
666 %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
667 %tmp4 = fmul <2 x float> %tmp1, %tmp3
668 ret <2 x float> %tmp4
671 define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
672 ;CHECK-LABEL: fmul_lane_4s:
675 %tmp1 = load <4 x float>, <4 x float>* %A
676 %tmp2 = load <4 x float>, <4 x float>* %B
677 %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
678 %tmp4 = fmul <4 x float> %tmp1, %tmp3
679 ret <4 x float> %tmp4
682 define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
683 ;CHECK-LABEL: fmul_lane_2d:
686 %tmp1 = load <2 x double>, <2 x double>* %A
687 %tmp2 = load <2 x double>, <2 x double>* %B
688 %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
689 %tmp4 = fmul <2 x double> %tmp1, %tmp3
690 ret <2 x double> %tmp4
693 define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
694 ;CHECK-LABEL: fmul_lane_s:
696 ;CHECK: fmul.s s0, s0, v1[3]
697 %B = extractelement <4 x float> %vec, i32 3
698 %res = fmul float %A, %B
702 define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
703 ;CHECK-LABEL: fmul_lane_d:
705 ;CHECK: fmul.d d0, d0, v1[1]
706 %B = extractelement <2 x double> %vec, i32 1
707 %res = fmul double %A, %B
713 define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
714 ;CHECK-LABEL: fmulx_lane_2s:
717 %tmp1 = load <2 x float>, <2 x float>* %A
718 %tmp2 = load <2 x float>, <2 x float>* %B
719 %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
720 %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
721 ret <2 x float> %tmp4
724 define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
725 ;CHECK-LABEL: fmulx_lane_4s:
728 %tmp1 = load <4 x float>, <4 x float>* %A
729 %tmp2 = load <4 x float>, <4 x float>* %B
730 %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
731 %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
732 ret <4 x float> %tmp4
735 define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
736 ;CHECK-LABEL: fmulx_lane_2d:
739 %tmp1 = load <2 x double>, <2 x double>* %A
740 %tmp2 = load <2 x double>, <2 x double>* %B
741 %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
742 %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
743 ret <2 x double> %tmp4
746 define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
747 ;CHECK-LABEL: sqdmulh_lane_4h:
750 %tmp1 = load <4 x i16>, <4 x i16>* %A
751 %tmp2 = load <4 x i16>, <4 x i16>* %B
752 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
753 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
757 define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
758 ;CHECK-LABEL: sqdmulh_lane_8h:
761 %tmp1 = load <8 x i16>, <8 x i16>* %A
762 %tmp2 = load <8 x i16>, <8 x i16>* %B
763 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
764 %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
768 define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
769 ;CHECK-LABEL: sqdmulh_lane_2s:
772 %tmp1 = load <2 x i32>, <2 x i32>* %A
773 %tmp2 = load <2 x i32>, <2 x i32>* %B
774 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
775 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
779 define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
780 ;CHECK-LABEL: sqdmulh_lane_4s:
783 %tmp1 = load <4 x i32>, <4 x i32>* %A
784 %tmp2 = load <4 x i32>, <4 x i32>* %B
785 %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
786 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
790 define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
791 ;CHECK-LABEL: sqdmulh_lane_1s:
793 ;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
794 %tmp1 = extractelement <4 x i32> %B, i32 1
795 %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
799 define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
800 ;CHECK-LABEL: sqrdmulh_lane_4h:
803 %tmp1 = load <4 x i16>, <4 x i16>* %A
804 %tmp2 = load <4 x i16>, <4 x i16>* %B
805 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
806 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
810 define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
811 ;CHECK-LABEL: sqrdmulh_lane_8h:
814 %tmp1 = load <8 x i16>, <8 x i16>* %A
815 %tmp2 = load <8 x i16>, <8 x i16>* %B
816 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
817 %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
821 define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
822 ;CHECK-LABEL: sqrdmulh_lane_2s:
825 %tmp1 = load <2 x i32>, <2 x i32>* %A
826 %tmp2 = load <2 x i32>, <2 x i32>* %B
827 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
828 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
832 define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
833 ;CHECK-LABEL: sqrdmulh_lane_4s:
836 %tmp1 = load <4 x i32>, <4 x i32>* %A
837 %tmp2 = load <4 x i32>, <4 x i32>* %B
838 %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
839 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
843 define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
844 ;CHECK-LABEL: sqrdmulh_lane_1s:
846 ;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
847 %tmp1 = extractelement <4 x i32> %B, i32 1
848 %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
852 define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
853 ;CHECK-LABEL: sqdmull_lane_4s:
856 %tmp1 = load <4 x i16>, <4 x i16>* %A
857 %tmp2 = load <4 x i16>, <4 x i16>* %B
858 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
859 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
863 define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
864 ;CHECK-LABEL: sqdmull_lane_2d:
867 %tmp1 = load <2 x i32>, <2 x i32>* %A
868 %tmp2 = load <2 x i32>, <2 x i32>* %B
869 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
870 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
874 define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
875 ;CHECK-LABEL: sqdmull2_lane_4s:
878 %load1 = load <8 x i16>, <8 x i16>* %A
879 %load2 = load <8 x i16>, <8 x i16>* %B
880 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
881 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
882 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
886 define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
887 ;CHECK-LABEL: sqdmull2_lane_2d:
890 %load1 = load <4 x i32>, <4 x i32>* %A
891 %load2 = load <4 x i32>, <4 x i32>* %B
892 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
893 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
894 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
898 define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
899 ;CHECK-LABEL: umull_lane_4s:
902 %tmp1 = load <4 x i16>, <4 x i16>* %A
903 %tmp2 = load <4 x i16>, <4 x i16>* %B
904 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
905 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
909 define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
910 ;CHECK-LABEL: umull_lane_2d:
913 %tmp1 = load <2 x i32>, <2 x i32>* %A
914 %tmp2 = load <2 x i32>, <2 x i32>* %B
915 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
916 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
920 define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
921 ;CHECK-LABEL: smull_lane_4s:
924 %tmp1 = load <4 x i16>, <4 x i16>* %A
925 %tmp2 = load <4 x i16>, <4 x i16>* %B
926 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
927 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
931 define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
932 ;CHECK-LABEL: smull_lane_2d:
935 %tmp1 = load <2 x i32>, <2 x i32>* %A
936 %tmp2 = load <2 x i32>, <2 x i32>* %B
937 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
938 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
942 define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
943 ;CHECK-LABEL: smlal_lane_4s:
946 %tmp1 = load <4 x i16>, <4 x i16>* %A
947 %tmp2 = load <4 x i16>, <4 x i16>* %B
948 %tmp3 = load <4 x i32>, <4 x i32>* %C
949 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
950 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
951 %tmp6 = add <4 x i32> %tmp3, %tmp5
955 define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
956 ;CHECK-LABEL: smlal_lane_2d:
959 %tmp1 = load <2 x i32>, <2 x i32>* %A
960 %tmp2 = load <2 x i32>, <2 x i32>* %B
961 %tmp3 = load <2 x i64>, <2 x i64>* %C
962 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
963 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
964 %tmp6 = add <2 x i64> %tmp3, %tmp5
968 define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
969 ;CHECK-LABEL: sqdmlal_lane_4s:
972 %tmp1 = load <4 x i16>, <4 x i16>* %A
973 %tmp2 = load <4 x i16>, <4 x i16>* %B
974 %tmp3 = load <4 x i32>, <4 x i32>* %C
975 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
976 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
977 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
981 define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
982 ;CHECK-LABEL: sqdmlal_lane_2d:
985 %tmp1 = load <2 x i32>, <2 x i32>* %A
986 %tmp2 = load <2 x i32>, <2 x i32>* %B
987 %tmp3 = load <2 x i64>, <2 x i64>* %C
988 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
989 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
990 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
994 define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
995 ;CHECK-LABEL: sqdmlal2_lane_4s:
998 %load1 = load <8 x i16>, <8 x i16>* %A
999 %load2 = load <8 x i16>, <8 x i16>* %B
1000 %tmp3 = load <4 x i32>, <4 x i32>* %C
1001 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1002 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1003 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1004 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1008 define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
1009 ;CHECK-LABEL: sqdmlal2_lane_2d:
1012 %load1 = load <4 x i32>, <4 x i32>* %A
1013 %load2 = load <4 x i32>, <4 x i32>* %B
1014 %tmp3 = load <2 x i64>, <2 x i64>* %C
1015 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1016 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1017 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1018 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1022 define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1023 ;CHECK-LABEL: sqdmlal_lane_1s:
1025 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1026 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1027 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1028 %prod = extractelement <4 x i32> %prod.vec, i32 0
1029 %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
1032 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
1034 define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1035 ;CHECK-LABEL: sqdmlsl_lane_1s:
1037 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1038 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1039 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1040 %prod = extractelement <4 x i32> %prod.vec, i32 0
1041 %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
1044 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
1046 define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1047 ;CHECK-LABEL: sqdmlal_lane_1d:
1049 %rhs = extractelement <2 x i32> %C, i32 1
1050 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1051 %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
1054 declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
1055 declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
1057 define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1058 ;CHECK-LABEL: sqdmlsl_lane_1d:
1060 %rhs = extractelement <2 x i32> %C, i32 1
1061 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1062 %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
1065 declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
1068 define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1069 ;CHECK-LABEL: umlal_lane_4s:
1072 %tmp1 = load <4 x i16>, <4 x i16>* %A
1073 %tmp2 = load <4 x i16>, <4 x i16>* %B
1074 %tmp3 = load <4 x i32>, <4 x i32>* %C
1075 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1076 %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1077 %tmp6 = add <4 x i32> %tmp3, %tmp5
1081 define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1082 ;CHECK-LABEL: umlal_lane_2d:
1085 %tmp1 = load <2 x i32>, <2 x i32>* %A
1086 %tmp2 = load <2 x i32>, <2 x i32>* %B
1087 %tmp3 = load <2 x i64>, <2 x i64>* %C
1088 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1089 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1090 %tmp6 = add <2 x i64> %tmp3, %tmp5
1095 define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1096 ;CHECK-LABEL: smlsl_lane_4s:
1099 %tmp1 = load <4 x i16>, <4 x i16>* %A
1100 %tmp2 = load <4 x i16>, <4 x i16>* %B
1101 %tmp3 = load <4 x i32>, <4 x i32>* %C
1102 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1103 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1104 %tmp6 = sub <4 x i32> %tmp3, %tmp5
1108 define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1109 ;CHECK-LABEL: smlsl_lane_2d:
1112 %tmp1 = load <2 x i32>, <2 x i32>* %A
1113 %tmp2 = load <2 x i32>, <2 x i32>* %B
1114 %tmp3 = load <2 x i64>, <2 x i64>* %C
1115 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1116 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1117 %tmp6 = sub <2 x i64> %tmp3, %tmp5
1121 define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1122 ;CHECK-LABEL: sqdmlsl_lane_4s:
1125 %tmp1 = load <4 x i16>, <4 x i16>* %A
1126 %tmp2 = load <4 x i16>, <4 x i16>* %B
1127 %tmp3 = load <4 x i32>, <4 x i32>* %C
1128 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1129 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1130 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1134 define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1135 ;CHECK-LABEL: sqdmlsl_lane_2d:
1138 %tmp1 = load <2 x i32>, <2 x i32>* %A
1139 %tmp2 = load <2 x i32>, <2 x i32>* %B
1140 %tmp3 = load <2 x i64>, <2 x i64>* %C
1141 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1142 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1143 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1147 define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
1148 ;CHECK-LABEL: sqdmlsl2_lane_4s:
1151 %load1 = load <8 x i16>, <8 x i16>* %A
1152 %load2 = load <8 x i16>, <8 x i16>* %B
1153 %tmp3 = load <4 x i32>, <4 x i32>* %C
1154 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1155 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1156 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1157 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1161 define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
1162 ;CHECK-LABEL: sqdmlsl2_lane_2d:
1165 %load1 = load <4 x i32>, <4 x i32>* %A
1166 %load2 = load <4 x i32>, <4 x i32>* %B
1167 %tmp3 = load <2 x i64>, <2 x i64>* %C
1168 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1169 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1170 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1171 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1175 define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1176 ;CHECK-LABEL: umlsl_lane_4s:
1179 %tmp1 = load <4 x i16>, <4 x i16>* %A
1180 %tmp2 = load <4 x i16>, <4 x i16>* %B
1181 %tmp3 = load <4 x i32>, <4 x i32>* %C
1182 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1183 %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1184 %tmp6 = sub <4 x i32> %tmp3, %tmp5
1188 define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1189 ;CHECK-LABEL: umlsl_lane_2d:
1192 %tmp1 = load <2 x i32>, <2 x i32>* %A
1193 %tmp2 = load <2 x i32>, <2 x i32>* %B
1194 %tmp3 = load <2 x i64>, <2 x i64>* %C
1195 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1196 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1197 %tmp6 = sub <2 x i64> %tmp3, %tmp5
1202 define float @fmulxs(float %a, float %b) nounwind {
1203 ; CHECK-LABEL: fmulxs:
1204 ; CHECK-NEXT: fmulx s0, s0, s1
1205 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
1210 define double @fmulxd(double %a, double %b) nounwind {
1211 ; CHECK-LABEL: fmulxd:
1212 ; CHECK-NEXT: fmulx d0, d0, d1
1213 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
1218 define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
1219 ; CHECK-LABEL: fmulxs_lane:
1220 ; CHECK-NEXT: fmulx.s s0, s0, v1[3]
1221 %b = extractelement <4 x float> %vec, i32 3
1222 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
1227 define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
1228 ; CHECK-LABEL: fmulxd_lane:
1229 ; CHECK-NEXT: fmulx.d d0, d0, v1[1]
1230 %b = extractelement <2 x double> %vec, i32 1
1231 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
1236 declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
1237 declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
1240 define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
1241 ; CHECK-LABEL: smull2_8h_simple:
1242 ; CHECK-NEXT: smull2.8h v0, v0, v1
1244 %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1245 %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1246 %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
1250 define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
1251 ; CHECK-LABEL: foo0:
1252 ; CHECK: smull2.8h v0, v0, v1
1253 %tmp = bitcast <16 x i8> %a to <2 x i64>
1254 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1255 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1256 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1257 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1258 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1259 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1260 ret <8 x i16> %vmull.i.i
1263 define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
1264 ; CHECK-LABEL: foo1:
1265 ; CHECK: smull2.4s v0, v0, v1
1266 %tmp = bitcast <8 x i16> %a to <2 x i64>
1267 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1268 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1269 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1270 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1271 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1272 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1273 ret <4 x i32> %vmull2.i.i
1276 define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
1277 ; CHECK-LABEL: foo2:
1278 ; CHECK: smull2.2d v0, v0, v1
1279 %tmp = bitcast <4 x i32> %a to <2 x i64>
1280 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1281 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1282 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1283 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1284 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1285 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1286 ret <2 x i64> %vmull2.i.i
1289 define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
1290 ; CHECK-LABEL: foo3:
1291 ; CHECK: umull2.8h v0, v0, v1
1292 %tmp = bitcast <16 x i8> %a to <2 x i64>
1293 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1294 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1295 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1296 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1297 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1298 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1299 ret <8 x i16> %vmull.i.i
1302 define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
1303 ; CHECK-LABEL: foo4:
1304 ; CHECK: umull2.4s v0, v0, v1
1305 %tmp = bitcast <8 x i16> %a to <2 x i64>
1306 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1307 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1308 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1309 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1310 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1311 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1312 ret <4 x i32> %vmull2.i.i
1315 define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
1316 ; CHECK-LABEL: foo5:
1317 ; CHECK: umull2.2d v0, v0, v1
1318 %tmp = bitcast <4 x i32> %a to <2 x i64>
1319 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1320 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1321 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1322 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1323 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1324 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1325 ret <2 x i64> %vmull2.i.i
1328 define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1329 ; CHECK-LABEL: foo6:
1330 ; CHECK-NEXT: smull2.4s v0, v1, v2[1]
1333 %0 = bitcast <8 x i16> %b to <2 x i64>
1334 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1335 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1336 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1337 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1338 ret <4 x i32> %vmull2.i
1341 define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1342 ; CHECK-LABEL: foo6a:
1343 ; CHECK-NEXT: smull.4s v0, v1, v2[1]
1346 %0 = bitcast <8 x i16> %b to <2 x i64>
1347 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1348 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1349 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1350 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1351 ret <4 x i32> %vmull2.i
1354 define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1355 ; CHECK-LABEL: foo7:
1356 ; CHECK-NEXT: smull2.2d v0, v1, v2[1]
1359 %0 = bitcast <4 x i32> %b to <2 x i64>
1360 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1361 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1362 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1363 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1364 ret <2 x i64> %vmull2.i
1367 define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1368 ; CHECK-LABEL: foo7a:
1369 ; CHECK-NEXT: smull.2d v0, v1, v2[1]
1372 %0 = bitcast <4 x i32> %b to <2 x i64>
1373 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1374 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1375 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1376 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1377 ret <2 x i64> %vmull2.i
1381 define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1382 ; CHECK-LABEL: foo8:
1383 ; CHECK-NEXT: umull2.4s v0, v1, v2[1]
1386 %0 = bitcast <8 x i16> %b to <2 x i64>
1387 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1388 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1389 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1390 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1391 ret <4 x i32> %vmull2.i
1394 define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1395 ; CHECK-LABEL: foo8a:
1396 ; CHECK-NEXT: umull.4s v0, v1, v2[1]
1399 %0 = bitcast <8 x i16> %b to <2 x i64>
1400 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1401 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1402 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1403 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1404 ret <4 x i32> %vmull2.i
1407 define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1408 ; CHECK-LABEL: foo9:
1409 ; CHECK-NEXT: umull2.2d v0, v1, v2[1]
1412 %0 = bitcast <4 x i32> %b to <2 x i64>
1413 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1414 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1415 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1416 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1417 ret <2 x i64> %vmull2.i
1420 define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1421 ; CHECK-LABEL: foo9a:
1422 ; CHECK-NEXT: umull.2d v0, v1, v2[1]
1425 %0 = bitcast <4 x i32> %b to <2 x i64>
1426 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1427 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1428 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1429 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1430 ret <2 x i64> %vmull2.i
1433 define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
1434 ; CHECK-LABEL: bar0:
1435 ; CHECK: smlal2.8h v0, v1, v2
1438 %tmp = bitcast <16 x i8> %b to <2 x i64>
1439 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1440 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
1441 %tmp2 = bitcast <16 x i8> %c to <2 x i64>
1442 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1443 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
1444 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1445 %add.i = add <8 x i16> %vmull.i.i.i, %a
1446 ret <8 x i16> %add.i
1449 define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
1450 ; CHECK-LABEL: bar1:
1451 ; CHECK: smlal2.4s v0, v1, v2
1454 %tmp = bitcast <8 x i16> %b to <2 x i64>
1455 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1456 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
1457 %tmp2 = bitcast <8 x i16> %c to <2 x i64>
1458 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1459 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
1460 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1461 %add.i = add <4 x i32> %vmull2.i.i.i, %a
1462 ret <4 x i32> %add.i
1465 define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
1466 ; CHECK-LABEL: bar2:
1467 ; CHECK: smlal2.2d v0, v1, v2
1470 %tmp = bitcast <4 x i32> %b to <2 x i64>
1471 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1472 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
1473 %tmp2 = bitcast <4 x i32> %c to <2 x i64>
1474 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1475 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
1476 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1477 %add.i = add <2 x i64> %vmull2.i.i.i, %a
1478 ret <2 x i64> %add.i
1481 define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
1482 ; CHECK-LABEL: bar3:
1483 ; CHECK: umlal2.8h v0, v1, v2
1486 %tmp = bitcast <16 x i8> %b to <2 x i64>
1487 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1488 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
1489 %tmp2 = bitcast <16 x i8> %c to <2 x i64>
1490 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1491 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
1492 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1493 %add.i = add <8 x i16> %vmull.i.i.i, %a
1494 ret <8 x i16> %add.i
1497 define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
1498 ; CHECK-LABEL: bar4:
1499 ; CHECK: umlal2.4s v0, v1, v2
1502 %tmp = bitcast <8 x i16> %b to <2 x i64>
1503 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1504 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
1505 %tmp2 = bitcast <8 x i16> %c to <2 x i64>
1506 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1507 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
1508 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1509 %add.i = add <4 x i32> %vmull2.i.i.i, %a
1510 ret <4 x i32> %add.i
1513 define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
1514 ; CHECK-LABEL: bar5:
1515 ; CHECK: umlal2.2d v0, v1, v2
1518 %tmp = bitcast <4 x i32> %b to <2 x i64>
1519 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1520 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
1521 %tmp2 = bitcast <4 x i32> %c to <2 x i64>
1522 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1523 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
1524 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1525 %add.i = add <2 x i64> %vmull2.i.i.i, %a
1526 ret <2 x i64> %add.i
1529 define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
1530 ; CHECK-LABEL: mlal2_1:
1531 ; CHECK: smlal2.4s v0, v1, v2[3]
1533 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1534 %tmp = bitcast <8 x i16> %b to <2 x i64>
1535 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1536 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1537 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
1538 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1539 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1540 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1541 %add = add <4 x i32> %vmull2.i.i, %a
1545 define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
1546 ; CHECK-LABEL: mlal2_2:
1547 ; CHECK: smlal2.2d v0, v1, v2[1]
1549 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1550 %tmp = bitcast <4 x i32> %b to <2 x i64>
1551 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1552 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1553 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
1554 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1555 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1556 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1557 %add = add <2 x i64> %vmull2.i.i, %a
1561 define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
1562 ; CHECK-LABEL: mlal2_4:
1563 ; CHECK: umlal2.4s v0, v1, v2[2]
1566 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
1567 %tmp = bitcast <8 x i16> %b to <2 x i64>
1568 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1569 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1570 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
1571 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1572 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1573 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1574 %add = add <4 x i32> %vmull2.i.i, %a
1578 define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
1579 ; CHECK-LABEL: mlal2_5:
1580 ; CHECK: umlal2.2d v0, v1, v2[0]
1582 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
1583 %tmp = bitcast <4 x i32> %b to <2 x i64>
1584 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1585 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1586 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
1587 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1588 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1589 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1590 %add = add <2 x i64> %vmull2.i.i, %a
; fmul by a scalar splat should select the indexed fmul with lane 0.
define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
; CHECK-LABEL: vmulq_n_f64:
; CHECK: fmul.2d v0, v0, v1[0]
  %vecinit.i = insertelement <2 x double> undef, double %y, i32 0
  %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
  %mul.i = fmul <2 x double> %vecinit1.i, %x
  ret <2 x double> %mul.i
}

define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
; CHECK-LABEL: vmulq_n_f32:
; CHECK: fmul.4s v0, v0, v1[0]
  %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
  %mul.i = fmul <4 x float> %vecinit3.i, %x
  ret <4 x float> %mul.i
}

define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
; CHECK-LABEL: vmul_n_f32:
; CHECK: fmul.2s v0, v0, v1[0]
  %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
  %mul.i = fmul <2 x float> %vecinit1.i, %x
  ret <2 x float> %mul.i
}
; mla with a lane taken from a 128-bit vector: the splatted lane index must
; survive into the indexed mla (v2[6] here).
define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
; CHECK-LABEL: vmla_laneq_s16_test:
; CHECK: mla.4h v0, v1, v2[6]
  %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %mul = mul <4 x i16> %shuffle, %b
  %add = add <4 x i16> %mul, %a
  ret <4 x i16> %add
}

define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
; CHECK-LABEL: vmla_laneq_s32_test:
; CHECK: mla.2s v0, v1, v2[3]
  %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
  %mul = mul <2 x i32> %shuffle, %b
  %add = add <2 x i32> %mul, %a
  ret <2 x i32> %add
}

; A splat built from two chained shuffles: lane 1 of the extracted high half
; is lane 5 of the original vector, so v2[5] is expected.
define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
; CHECK-LABEL: not_really_vmlaq_laneq_s16_test:
; CHECK: mla.8h v0, v1, v2[5]
  %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %mul = mul <8 x i16> %shuffle2, %b
  %add = add <8 x i16> %mul, %a
  ret <8 x i16> %add
}

define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
; CHECK-LABEL: not_really_vmlaq_laneq_s32_test:
; CHECK: mla.4s v0, v1, v2[3]
  %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %mul = mul <4 x i32> %shuffle2, %b
  %add = add <4 x i32> %mul, %a
  ret <4 x i32> %add
}
; smull/umull with a lane splatted from a 128-bit ("laneq") operand should
; select the indexed widening multiply.
define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_s16_test:
; CHECK: smull.4s v0, v0, v1[6]
  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
  ret <4 x i32> %vmull2.i
}

define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_s32_test:
; CHECK: smull.2d v0, v0, v1[2]
  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
  ret <2 x i64> %vmull2.i
}

define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_u16_test:
; CHECK: umull.4s v0, v0, v1[6]
  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
  ret <4 x i32> %vmull2.i
}

define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
; CHECK-LABEL: vmull_laneq_u32_test:
; CHECK: umull.2d v0, v0, v1[2]
  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
  ret <2 x i64> %vmull2.i
}
; The *_n tests build the scalar operand via insertelement splats and select
; the low/high half of the wide operand with a bitcast + <1 x i64> shuffle.
define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_low_n_s16_test:
  %conv = trunc i32 %d to i16
  %0 = bitcast <8 x i16> %b to <2 x i64>
  ; low 64 bits of %b
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
  ret <4 x i32> %vmull2.i.i
}

define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_high_n_s16_test:
  %conv = trunc i32 %d to i16
  %0 = bitcast <8 x i16> %b to <2 x i64>
  ; high 64 bits of %b
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
  ret <4 x i32> %vmull2.i.i
}

define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_high_n_s32_test:
  %0 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
  ret <2 x i64> %vmull2.i.i
}

define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_high_n_u16_test:
  %conv = trunc i32 %d to i16
  %0 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
  ret <4 x i32> %vmull2.i.i
}

define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_high_n_u32_test:
  %0 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
  ret <2 x i64> %vmull2.i.i
}
; A dup built manually from extractelement + insertelement chains should
; still be matched to the indexed mul form.
define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vmul_built_dup_test:
; CHECK: mul.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[1]
  %vget_lane = extractelement <4 x i32> %b, i32 1
  %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
  %prod = mul <4 x i32> %a, %vecinit3.i
  ret <4 x i32> %prod
}

define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmul_built_dup_fromsmall_test:
; CHECK: mul.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[3]
  %vget_lane = extractelement <4 x i16> %b, i32 3
  %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
  %prod = mul <4 x i16> %a, %vecinit3.i
  ret <4 x i16> %prod
}

; Widened dup: the lane source is a 64-bit vector but the mul is 128-bit.
define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
; CHECK: mul.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
  %vget_lane = extractelement <4 x i16> %b, i32 0
  %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
  %prod = mul <8 x i16> %a, %vecinit7.i
  ret <8 x i16> %prod
}
; sqdmull fed by two high-half extracts should use the "2" (high) variant.
define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mull_from_two_extracts:
; CHECK: sqdmull2.2d
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  ret <2 x i64> %res
}

define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mlal_from_two_extracts:
; CHECK: sqdmlal2.2d
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup_low:
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  ret <2 x i64> %res
}

define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup_high:
; CHECK: sqdmull2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  ret <2 x i64> %res
}
; Polynomial multiply (pmull) with dup'd or lane-splatted operands.
define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup_low:
  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
  ret <8 x i16> %res
}

define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup_high:
  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
  ret <8 x i16> %res
}

define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane_low:
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
  ret <8 x i16> %res
}

define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane_high:
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
  ret <8 x i16> %res
}
; Widening multiplies/accumulates where the RHS is a lane splat; the high
; variants should select the "2" instruction forms.
define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane_low:
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  ret <2 x i64> %res
}

define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane_high:
; CHECK: sqdmull2.2d
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  ret <2 x i64> %res
}

define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
; CHECK: sqdmlal2.2d
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_low:
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = add <2 x i64> %accum, %res
  ret <2 x i64> %sum
}

define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_high:
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = add <2 x i64> %accum, %res
  ret <2 x i64> %sum
}
; Scalar fused multiply-add/subtract where one operand is extracted from a
; vector lane: should select the indexed fmla.s/fmls.s (or .d) forms.
define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
; CHECK: fmla.s s0, s1, v2[3]
  %rhs = extractelement <4 x float> %rvec, i32 3
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
; CHECK: fmla.s s0, s1, v2[1]
  %rhs = extractelement <2 x float> %rvec, i32 1
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

; The fneg of the extracted lane should fold into fmls.
define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
; CHECK: fmls.s s0, s1, v2[3]
  %rhs.scal = extractelement <4 x float> %rvec, i32 3
  %rhs = fsub float -0.0, %rhs.scal
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
; CHECK: fmls.s s0, s1, v2[1]
  %rhs.scal = extractelement <2 x float> %rvec, i32 1
  %rhs = fsub float -0.0, %rhs.scal
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

declare float @llvm.fma.f32(float, float, float)

define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
; CHECK: fmla.d d0, d1, v2[1]
  %rhs = extractelement <2 x double> %rvec, i32 1
  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
  ret double %res
}

define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
; CHECK: fmls.d d0, d1, v2[1]
  %rhs.scal = extractelement <2 x double> %rvec, i32 1
  %rhs = fsub double -0.0, %rhs.scal
  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
  ret double %res
}

declare double @llvm.fma.f64(double, double, double)
; When the whole vector is negated before the lane splat, the negation should
; still fold into an indexed fmls.
define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
; CHECK: fmls.2s v0, v1, v2[3]
  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
  ret <2 x float> %res
}

define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
; CHECK: fmls.2s v0, v1, v2[1]
  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
  ret <2 x float> %res
}

define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
; CHECK: fmls.4s v0, v1, v2[3]
  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
  ret <4 x float> %res
}

define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
; CHECK: fmls.4s v0, v1, v2[1]
  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
  ret <4 x float> %res
}

define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
; CHECK: fmls.2d v0, v1, v2[1]
  %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
  %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
  ret <2 x double> %res
}
; <1 x T> and scalar saturating/polynomial multiply legalization tests.
define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fmul_v1f64:
  %prod = fmul <1 x double> %L, %R
  ret <1 x double> %prod
}

define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fdiv_v1f64:
  %prod = fdiv <1 x double> %L, %R
  ret <1 x double> %prod
}

define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlal_d:
  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
  ret i64 %tmp5
}

define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlsl_d:
  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
  ret i64 %tmp5
}

define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK-LABEL: test_pmull_64:
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
  ret <16 x i8> %val
}

; pmull64 of the high lanes of two vectors.
define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
; CHECK-LABEL: test_pmull_high_64:
  %l_hi = extractelement <2 x i64> %l, i32 1
  %r_hi = extractelement <2 x i64> %r, i32 1
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
  ret <16 x i8> %val
}

declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
2178 define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
2179 ; CHECK-LABEL: test_mul_v1i64:
2181 %prod = mul <1 x i64> %lhs, %rhs