1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+aes | FileCheck %s
; smull8h: widening multiply of two <8 x i8> vectors loaded from %A and %B via
; the smull.v8i16 intrinsic. The assertions expect two d-register loads
; followed by a single smull.8h.
4 define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind {
5 ; CHECK-LABEL: smull8h:
7 ; CHECK-NEXT: ldr d0, [x0]
8 ; CHECK-NEXT: ldr d1, [x1]
9 ; CHECK-NEXT: smull.8h v0, v0, v1
11 %tmp1 = load <8 x i8>, ptr %A
12 %tmp2 = load <8 x i8>, ptr %B
13 %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
; smull4s: widening multiply of two <4 x i16> vectors loaded from %A and %B via
; the smull.v4i32 intrinsic. The assertions expect two d-register loads
; followed by a single smull.4s.
17 define <4 x i32> @smull4s(ptr %A, ptr %B) nounwind {
18 ; CHECK-LABEL: smull4s:
20 ; CHECK-NEXT: ldr d0, [x0]
21 ; CHECK-NEXT: ldr d1, [x1]
22 ; CHECK-NEXT: smull.4s v0, v0, v1
24 %tmp1 = load <4 x i16>, ptr %A
25 %tmp2 = load <4 x i16>, ptr %B
26 %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
; smull2d: widening multiply of two <2 x i32> vectors loaded from %A and %B via
; the smull.v2i64 intrinsic. The assertions expect two d-register loads
; followed by a single smull.2d.
30 define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind {
31 ; CHECK-LABEL: smull2d:
33 ; CHECK-NEXT: ldr d0, [x0]
34 ; CHECK-NEXT: ldr d1, [x1]
35 ; CHECK-NEXT: smull.2d v0, v0, v1
37 %tmp1 = load <2 x i32>, ptr %A
38 %tmp2 = load <2 x i32>, ptr %B
39 %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
43 declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
44 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
45 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; umull8h: widening multiply of two <8 x i8> vectors loaded from %A and %B via
; the umull.v8i16 intrinsic. The assertions expect two d-register loads
; followed by a single umull.8h.
47 define <8 x i16> @umull8h(ptr %A, ptr %B) nounwind {
48 ; CHECK-LABEL: umull8h:
50 ; CHECK-NEXT: ldr d0, [x0]
51 ; CHECK-NEXT: ldr d1, [x1]
52 ; CHECK-NEXT: umull.8h v0, v0, v1
54 %tmp1 = load <8 x i8>, ptr %A
55 %tmp2 = load <8 x i8>, ptr %B
56 %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
; umull4s: widening multiply of two <4 x i16> vectors loaded from %A and %B via
; the umull.v4i32 intrinsic. The assertions expect two d-register loads
; followed by a single umull.4s.
60 define <4 x i32> @umull4s(ptr %A, ptr %B) nounwind {
61 ; CHECK-LABEL: umull4s:
63 ; CHECK-NEXT: ldr d0, [x0]
64 ; CHECK-NEXT: ldr d1, [x1]
65 ; CHECK-NEXT: umull.4s v0, v0, v1
67 %tmp1 = load <4 x i16>, ptr %A
68 %tmp2 = load <4 x i16>, ptr %B
69 %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
; umull2d: widening multiply of two <2 x i32> vectors loaded from %A and %B via
; the umull.v2i64 intrinsic. The assertions expect two d-register loads
; followed by a single umull.2d.
73 define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind {
74 ; CHECK-LABEL: umull2d:
76 ; CHECK-NEXT: ldr d0, [x0]
77 ; CHECK-NEXT: ldr d1, [x1]
78 ; CHECK-NEXT: umull.2d v0, v0, v1
80 %tmp1 = load <2 x i32>, ptr %A
81 %tmp2 = load <2 x i32>, ptr %B
82 %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
86 declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
87 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
88 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; sqdmull4s: applies the sqdmull.v4i32 intrinsic to two <4 x i16> vectors
; loaded from %A and %B. The assertions expect two d-register loads followed
; by a single sqdmull.4s.
90 define <4 x i32> @sqdmull4s(ptr %A, ptr %B) nounwind {
91 ; CHECK-LABEL: sqdmull4s:
93 ; CHECK-NEXT: ldr d0, [x0]
94 ; CHECK-NEXT: ldr d1, [x1]
95 ; CHECK-NEXT: sqdmull.4s v0, v0, v1
97 %tmp1 = load <4 x i16>, ptr %A
98 %tmp2 = load <4 x i16>, ptr %B
99 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
; sqdmull2d: applies the sqdmull.v2i64 intrinsic to two <2 x i32> vectors
; loaded from %A and %B. The assertions expect two d-register loads followed
; by a single sqdmull.2d.
103 define <2 x i64> @sqdmull2d(ptr %A, ptr %B) nounwind {
104 ; CHECK-LABEL: sqdmull2d:
106 ; CHECK-NEXT: ldr d0, [x0]
107 ; CHECK-NEXT: ldr d1, [x1]
108 ; CHECK-NEXT: sqdmull.2d v0, v0, v1
110 %tmp1 = load <2 x i32>, ptr %A
111 %tmp2 = load <2 x i32>, ptr %B
112 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
; sqdmull2_4s: sqdmull.v4i32 on the upper halves of two <8 x i16> loads.
; The assertions expect each upper half to be loaded directly from offset #8
; into a d-register, so a plain sqdmull.4s is used.
116 define <4 x i32> @sqdmull2_4s(ptr %A, ptr %B) nounwind {
117 ; CHECK-LABEL: sqdmull2_4s:
119 ; CHECK-NEXT: ldr d0, [x0, #8]
120 ; CHECK-NEXT: ldr d1, [x1, #8]
121 ; CHECK-NEXT: sqdmull.4s v0, v0, v1
123 %load1 = load <8 x i16>, ptr %A
124 %load2 = load <8 x i16>, ptr %B
; The shuffle masks <4, 5, 6, 7> select the upper four elements of each load.
125 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
126 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
127 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
; sqdmull2_2d: sqdmull.v2i64 on the upper halves of two <4 x i32> loads.
; The assertions expect each upper half to be loaded directly from offset #8
; into a d-register, so a plain sqdmull.2d is used.
131 define <2 x i64> @sqdmull2_2d(ptr %A, ptr %B) nounwind {
132 ; CHECK-LABEL: sqdmull2_2d:
134 ; CHECK-NEXT: ldr d0, [x0, #8]
135 ; CHECK-NEXT: ldr d1, [x1, #8]
136 ; CHECK-NEXT: sqdmull.2d v0, v0, v1
138 %load1 = load <4 x i32>, ptr %A
139 %load2 = load <4 x i32>, ptr %B
; The shuffle masks <2, 3> select the upper two elements of each load.
140 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
141 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
142 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
147 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
148 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; pmull8h: applies the pmull.v8i16 intrinsic to two <8 x i8> vectors loaded
; from %A and %B. The assertions expect two d-register loads followed by a
; single pmull.8h.
150 define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind {
151 ; CHECK-LABEL: pmull8h:
153 ; CHECK-NEXT: ldr d0, [x0]
154 ; CHECK-NEXT: ldr d1, [x1]
155 ; CHECK-NEXT: pmull.8h v0, v0, v1
157 %tmp1 = load <8 x i8>, ptr %A
158 %tmp2 = load <8 x i8>, ptr %B
159 %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
163 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
; sqdmulh_4h: applies the sqdmulh.v4i16 intrinsic to two <4 x i16> vectors
; loaded from %A and %B. The assertions expect two d-register loads followed
; by a single sqdmulh.4h.
165 define <4 x i16> @sqdmulh_4h(ptr %A, ptr %B) nounwind {
166 ; CHECK-LABEL: sqdmulh_4h:
168 ; CHECK-NEXT: ldr d0, [x0]
169 ; CHECK-NEXT: ldr d1, [x1]
170 ; CHECK-NEXT: sqdmulh.4h v0, v0, v1
172 %tmp1 = load <4 x i16>, ptr %A
173 %tmp2 = load <4 x i16>, ptr %B
174 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
; sqdmulh_8h: applies the sqdmulh.v8i16 intrinsic to two <8 x i16> vectors
; loaded from %A and %B. The assertions expect two q-register loads followed
; by a single sqdmulh.8h.
178 define <8 x i16> @sqdmulh_8h(ptr %A, ptr %B) nounwind {
179 ; CHECK-LABEL: sqdmulh_8h:
181 ; CHECK-NEXT: ldr q0, [x0]
182 ; CHECK-NEXT: ldr q1, [x1]
183 ; CHECK-NEXT: sqdmulh.8h v0, v0, v1
185 %tmp1 = load <8 x i16>, ptr %A
186 %tmp2 = load <8 x i16>, ptr %B
187 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
; sqdmulh_2s: applies the sqdmulh.v2i32 intrinsic to two <2 x i32> vectors
; loaded from %A and %B. The assertions expect two d-register loads followed
; by a single sqdmulh.2s.
191 define <2 x i32> @sqdmulh_2s(ptr %A, ptr %B) nounwind {
192 ; CHECK-LABEL: sqdmulh_2s:
194 ; CHECK-NEXT: ldr d0, [x0]
195 ; CHECK-NEXT: ldr d1, [x1]
196 ; CHECK-NEXT: sqdmulh.2s v0, v0, v1
198 %tmp1 = load <2 x i32>, ptr %A
199 %tmp2 = load <2 x i32>, ptr %B
200 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
; sqdmulh_4s: applies the sqdmulh.v4i32 intrinsic to two <4 x i32> vectors
; loaded from %A and %B. The assertions expect two q-register loads followed
; by a single sqdmulh.4s.
204 define <4 x i32> @sqdmulh_4s(ptr %A, ptr %B) nounwind {
205 ; CHECK-LABEL: sqdmulh_4s:
207 ; CHECK-NEXT: ldr q0, [x0]
208 ; CHECK-NEXT: ldr q1, [x1]
209 ; CHECK-NEXT: sqdmulh.4s v0, v0, v1
211 %tmp1 = load <4 x i32>, ptr %A
212 %tmp2 = load <4 x i32>, ptr %B
213 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
; sqdmulh_1s: scalar form, applying the sqdmulh.i32 intrinsic to two i32
; values loaded from %A and %B. The assertions expect the operands to be
; loaded into w-registers, moved to s-registers with fmov, combined by a
; scalar sqdmulh, and the result moved back to w0 with fmov.
217 define i32 @sqdmulh_1s(ptr %A, ptr %B) nounwind {
218 ; CHECK-LABEL: sqdmulh_1s:
220 ; CHECK-NEXT: ldr w8, [x0]
221 ; CHECK-NEXT: ldr w9, [x1]
222 ; CHECK-NEXT: fmov s0, w8
223 ; CHECK-NEXT: fmov s1, w9
224 ; CHECK-NEXT: sqdmulh s0, s0, s1
225 ; CHECK-NEXT: fmov w0, s0
227 %tmp1 = load i32, ptr %A
228 %tmp2 = load i32, ptr %B
229 %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
233 declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
234 declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
235 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
236 declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
237 declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
; sqrdmulh_4h: applies the sqrdmulh.v4i16 intrinsic to two <4 x i16> vectors
; loaded from %A and %B. The assertions expect two d-register loads followed
; by a single sqrdmulh.4h.
239 define <4 x i16> @sqrdmulh_4h(ptr %A, ptr %B) nounwind {
240 ; CHECK-LABEL: sqrdmulh_4h:
242 ; CHECK-NEXT: ldr d0, [x0]
243 ; CHECK-NEXT: ldr d1, [x1]
244 ; CHECK-NEXT: sqrdmulh.4h v0, v0, v1
246 %tmp1 = load <4 x i16>, ptr %A
247 %tmp2 = load <4 x i16>, ptr %B
248 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
; sqrdmulh_8h: applies the sqrdmulh.v8i16 intrinsic to two <8 x i16> vectors
; loaded from %A and %B. The assertions expect two q-register loads followed
; by a single sqrdmulh.8h.
252 define <8 x i16> @sqrdmulh_8h(ptr %A, ptr %B) nounwind {
253 ; CHECK-LABEL: sqrdmulh_8h:
255 ; CHECK-NEXT: ldr q0, [x0]
256 ; CHECK-NEXT: ldr q1, [x1]
257 ; CHECK-NEXT: sqrdmulh.8h v0, v0, v1
259 %tmp1 = load <8 x i16>, ptr %A
260 %tmp2 = load <8 x i16>, ptr %B
261 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
; sqrdmulh_2s: applies the sqrdmulh.v2i32 intrinsic to two <2 x i32> vectors
; loaded from %A and %B. The assertions expect two d-register loads followed
; by a single sqrdmulh.2s.
265 define <2 x i32> @sqrdmulh_2s(ptr %A, ptr %B) nounwind {
266 ; CHECK-LABEL: sqrdmulh_2s:
268 ; CHECK-NEXT: ldr d0, [x0]
269 ; CHECK-NEXT: ldr d1, [x1]
270 ; CHECK-NEXT: sqrdmulh.2s v0, v0, v1
272 %tmp1 = load <2 x i32>, ptr %A
273 %tmp2 = load <2 x i32>, ptr %B
274 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
; sqrdmulh_4s: applies the sqrdmulh.v4i32 intrinsic to two <4 x i32> vectors
; loaded from %A and %B. The assertions expect two q-register loads followed
; by a single sqrdmulh.4s.
278 define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind {
279 ; CHECK-LABEL: sqrdmulh_4s:
281 ; CHECK-NEXT: ldr q0, [x0]
282 ; CHECK-NEXT: ldr q1, [x1]
283 ; CHECK-NEXT: sqrdmulh.4s v0, v0, v1
285 %tmp1 = load <4 x i32>, ptr %A
286 %tmp2 = load <4 x i32>, ptr %B
287 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
; sqrdmulh_1s: scalar form, applying the sqrdmulh.i32 intrinsic to two i32
; values loaded from %A and %B. The assertions expect the operands to be
; loaded into w-registers, moved to s-registers with fmov, combined by a
; scalar sqrdmulh, and the result moved back to w0 with fmov.
291 define i32 @sqrdmulh_1s(ptr %A, ptr %B) nounwind {
292 ; CHECK-LABEL: sqrdmulh_1s:
294 ; CHECK-NEXT: ldr w8, [x0]
295 ; CHECK-NEXT: ldr w9, [x1]
296 ; CHECK-NEXT: fmov s0, w8
297 ; CHECK-NEXT: fmov s1, w9
298 ; CHECK-NEXT: sqrdmulh s0, s0, s1
299 ; CHECK-NEXT: fmov w0, s0
301 %tmp1 = load i32, ptr %A
302 %tmp2 = load i32, ptr %B
303 %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
307 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
308 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
309 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
310 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
311 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
; fmulx_2s: applies the fmulx.v2f32 intrinsic to two <2 x float> vectors
; loaded from %A and %B and returns the result. The assertions expect two
; d-register loads followed by a single fmulx.2s.
313 define <2 x float> @fmulx_2s(ptr %A, ptr %B) nounwind {
314 ; CHECK-LABEL: fmulx_2s:
316 ; CHECK-NEXT: ldr d0, [x0]
317 ; CHECK-NEXT: ldr d1, [x1]
318 ; CHECK-NEXT: fmulx.2s v0, v0, v1
320 %tmp1 = load <2 x float>, ptr %A
321 %tmp2 = load <2 x float>, ptr %B
322 %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
323 ret <2 x float> %tmp3
; fmulx_4s: applies the fmulx.v4f32 intrinsic to two <4 x float> vectors
; loaded from %A and %B and returns the result. The assertions expect two
; q-register loads followed by a single fmulx.4s.
326 define <4 x float> @fmulx_4s(ptr %A, ptr %B) nounwind {
327 ; CHECK-LABEL: fmulx_4s:
329 ; CHECK-NEXT: ldr q0, [x0]
330 ; CHECK-NEXT: ldr q1, [x1]
331 ; CHECK-NEXT: fmulx.4s v0, v0, v1
333 %tmp1 = load <4 x float>, ptr %A
334 %tmp2 = load <4 x float>, ptr %B
335 %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
336 ret <4 x float> %tmp3
; fmulx_2d: applies the fmulx.v2f64 intrinsic to two <2 x double> vectors
; loaded from %A and %B and returns the result. The assertions expect two
; q-register loads followed by a single fmulx.2d.
339 define <2 x double> @fmulx_2d(ptr %A, ptr %B) nounwind {
340 ; CHECK-LABEL: fmulx_2d:
342 ; CHECK-NEXT: ldr q0, [x0]
343 ; CHECK-NEXT: ldr q1, [x1]
344 ; CHECK-NEXT: fmulx.2d v0, v0, v1
346 %tmp1 = load <2 x double>, ptr %A
347 %tmp2 = load <2 x double>, ptr %B
348 %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
349 ret <2 x double> %tmp3
352 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
353 declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
354 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
; smlal4s: adds the smull.v4i32 product of the <4 x i16> vectors at %A and %B
; to the <4 x i32> accumulator loaded from %C. The assertions expect the
; multiply and add to be fused into a single smlal.4s on the accumulator.
356 define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind {
357 ; CHECK-LABEL: smlal4s:
359 ; CHECK-NEXT: ldr d1, [x0]
360 ; CHECK-NEXT: ldr d2, [x1]
361 ; CHECK-NEXT: ldr q0, [x2]
362 ; CHECK-NEXT: smlal.4s v0, v1, v2
364 %tmp1 = load <4 x i16>, ptr %A
365 %tmp2 = load <4 x i16>, ptr %B
366 %tmp3 = load <4 x i32>, ptr %C
367 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
368 %tmp5 = add <4 x i32> %tmp3, %tmp4
; smlal2d: adds the smull.v2i64 product of the <2 x i32> vectors at %A and %B
; to the <2 x i64> accumulator loaded from %C. The assertions expect the
; multiply and add to be fused into a single smlal.2d on the accumulator.
372 define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind {
373 ; CHECK-LABEL: smlal2d:
375 ; CHECK-NEXT: ldr d1, [x0]
376 ; CHECK-NEXT: ldr d2, [x1]
377 ; CHECK-NEXT: ldr q0, [x2]
378 ; CHECK-NEXT: smlal.2d v0, v1, v2
380 %tmp1 = load <2 x i32>, ptr %A
381 %tmp2 = load <2 x i32>, ptr %B
382 %tmp3 = load <2 x i64>, ptr %C
383 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
384 %tmp5 = add <2 x i64> %tmp3, %tmp4
; smlal8h_chain_with_constant: computes smull(%v1, %v3) + splat(257) +
; smull(%v2, ~%v3) and stores it to %dst. The assertions expect the constant
; to be materialized first (movi.16b #1 gives 0x0101 = 257 in each i16 lane)
; and both multiplies to be folded into smlal.8h on that register.
388 define void @smlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
389 ; CHECK-LABEL: smlal8h_chain_with_constant:
391 ; CHECK-NEXT: movi.16b v3, #1
392 ; CHECK-NEXT: smlal.8h v3, v0, v2
393 ; CHECK-NEXT: mvn.8b v0, v2
394 ; CHECK-NEXT: smlal.8h v3, v1, v0
395 ; CHECK-NEXT: str q3, [x0]
; xor with all-ones is a bitwise NOT of %v3 (the mvn.8b in the assertions).
397 %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
398 %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
399 %add.1 = add <8 x i16> %smull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
400 %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
401 %add.2 = add <8 x i16> %add.1, %smull.2
402 store <8 x i16> %add.2, ptr %dst
; smlal2d_chain_with_constant: computes smull(%v1, %v3) + splat(257) +
; smull(%v2, ~%v3) and stores it to %dst. The assertions expect the constant
; to be materialized first (mov #257 then dup.2d) and both multiplies to be
; folded into smlal.2d on that register.
406 define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
407 ; CHECK-LABEL: smlal2d_chain_with_constant:
409 ; CHECK-NEXT: mov w8, #257 // =0x101
410 ; CHECK-NEXT: dup.2d v3, x8
411 ; CHECK-NEXT: smlal.2d v3, v0, v2
412 ; CHECK-NEXT: mvn.8b v0, v2
413 ; CHECK-NEXT: smlal.2d v3, v1, v0
414 ; CHECK-NEXT: str q3, [x0]
; xor with all-ones is a bitwise NOT of %v3 (the mvn.8b in the assertions).
416 %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
417 %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
418 %add.1 = add <2 x i64> %smull.1, <i64 257, i64 257>
419 %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
420 %add.2 = add <2 x i64> %add.1, %smull.2
421 store <2 x i64> %add.2, ptr %dst
; smlsl4s: subtracts the smull.v4i32 product of the <4 x i16> vectors at %A
; and %B from the <4 x i32> accumulator loaded from %C. The assertions expect
; the multiply and subtract to be fused into a single smlsl.4s.
425 define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
426 ; CHECK-LABEL: smlsl4s:
428 ; CHECK-NEXT: ldr d1, [x0]
429 ; CHECK-NEXT: ldr d2, [x1]
430 ; CHECK-NEXT: ldr q0, [x2]
431 ; CHECK-NEXT: smlsl.4s v0, v1, v2
433 %tmp1 = load <4 x i16>, ptr %A
434 %tmp2 = load <4 x i16>, ptr %B
435 %tmp3 = load <4 x i32>, ptr %C
436 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
437 %tmp5 = sub <4 x i32> %tmp3, %tmp4
; smlsl2d: subtracts the smull.v2i64 product of the <2 x i32> vectors at %A
; and %B from the <2 x i64> accumulator loaded from %C. The assertions expect
; the multiply and subtract to be fused into a single smlsl.2d.
441 define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
442 ; CHECK-LABEL: smlsl2d:
444 ; CHECK-NEXT: ldr d1, [x0]
445 ; CHECK-NEXT: ldr d2, [x1]
446 ; CHECK-NEXT: ldr q0, [x2]
447 ; CHECK-NEXT: smlsl.2d v0, v1, v2
449 %tmp1 = load <2 x i32>, ptr %A
450 %tmp2 = load <2 x i32>, ptr %B
451 %tmp3 = load <2 x i64>, ptr %C
452 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
453 %tmp5 = sub <2 x i64> %tmp3, %tmp4
; smlsl8h_chain_with_constant: computes splat(257) - smull(%v1, %v3) -
; smull(%v2, ~%v3) and stores it to %dst. The assertions expect the constant
; to be materialized first (movi.16b #1 gives 0x0101 = 257 in each i16 lane)
; and both multiplies to be folded into smlsl.8h on that register.
457 define void @smlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
458 ; CHECK-LABEL: smlsl8h_chain_with_constant:
460 ; CHECK-NEXT: movi.16b v3, #1
461 ; CHECK-NEXT: smlsl.8h v3, v0, v2
462 ; CHECK-NEXT: mvn.8b v0, v2
463 ; CHECK-NEXT: smlsl.8h v3, v1, v0
464 ; CHECK-NEXT: str q3, [x0]
; xor with all-ones is a bitwise NOT of %v3 (the mvn.8b in the assertions).
466 %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
467 %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
468 %sub.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %smull.1
469 %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
470 %sub.2 = sub <8 x i16> %sub.1, %smull.2
471 store <8 x i16> %sub.2, ptr %dst
; smlsl2d_chain_with_constant: computes splat(257) - smull(%v1, %v3) -
; smull(%v2, ~%v3) and stores it to %dst. The assertions expect the constant
; to be materialized first (mov #257 then dup.2d) and both multiplies to be
; folded into smlsl.2d on that register.
475 define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
476 ; CHECK-LABEL: smlsl2d_chain_with_constant:
478 ; CHECK-NEXT: mov w8, #257 // =0x101
479 ; CHECK-NEXT: dup.2d v3, x8
480 ; CHECK-NEXT: smlsl.2d v3, v0, v2
481 ; CHECK-NEXT: mvn.8b v0, v2
482 ; CHECK-NEXT: smlsl.2d v3, v1, v0
483 ; CHECK-NEXT: str q3, [x0]
; xor with all-ones is a bitwise NOT of %v3 (the mvn.8b in the assertions).
485 %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
486 %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
487 %sub.1 = sub <2 x i64> <i64 257, i64 257>, %smull.1
488 %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
489 %sub.2 = sub <2 x i64> %sub.1, %smull.2
490 store <2 x i64> %sub.2, ptr %dst
494 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
495 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
496 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
497 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
; sqdmlal4s: feeds the sqdmull.v4i32 product of the <4 x i16> vectors at %A
; and %B into sqadd.v4i32 with the accumulator loaded from %C. The assertions
; expect the pair of intrinsics to be fused into a single sqdmlal.4s.
499 define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind {
500 ; CHECK-LABEL: sqdmlal4s:
502 ; CHECK-NEXT: ldr d1, [x0]
503 ; CHECK-NEXT: ldr d2, [x1]
504 ; CHECK-NEXT: ldr q0, [x2]
505 ; CHECK-NEXT: sqdmlal.4s v0, v1, v2
507 %tmp1 = load <4 x i16>, ptr %A
508 %tmp2 = load <4 x i16>, ptr %B
509 %tmp3 = load <4 x i32>, ptr %C
510 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
511 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
; sqdmlal2d: feeds the sqdmull.v2i64 product of the <2 x i32> vectors at %A
; and %B into sqadd.v2i64 with the accumulator loaded from %C. The assertions
; expect the pair of intrinsics to be fused into a single sqdmlal.2d.
515 define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind {
516 ; CHECK-LABEL: sqdmlal2d:
518 ; CHECK-NEXT: ldr d1, [x0]
519 ; CHECK-NEXT: ldr d2, [x1]
520 ; CHECK-NEXT: ldr q0, [x2]
521 ; CHECK-NEXT: sqdmlal.2d v0, v1, v2
523 %tmp1 = load <2 x i32>, ptr %A
524 %tmp2 = load <2 x i32>, ptr %B
525 %tmp3 = load <2 x i64>, ptr %C
526 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
527 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
; sqdmlal2_4s: sqdmull.v4i32 on the upper halves of two <8 x i16> loads,
; accumulated with sqadd.v4i32 onto the vector loaded from %C. The assertions
; expect the upper halves to be loaded directly from offset #8 and a plain
; sqdmlal.4s on the accumulator.
531 define <4 x i32> @sqdmlal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
532 ; CHECK-LABEL: sqdmlal2_4s:
534 ; CHECK-NEXT: ldr q0, [x2]
535 ; CHECK-NEXT: ldr d1, [x0, #8]
536 ; CHECK-NEXT: ldr d2, [x1, #8]
537 ; CHECK-NEXT: sqdmlal.4s v0, v1, v2
539 %load1 = load <8 x i16>, ptr %A
540 %load2 = load <8 x i16>, ptr %B
541 %tmp3 = load <4 x i32>, ptr %C
; The shuffle masks <4, 5, 6, 7> select the upper four elements of each load.
542 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
543 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
544 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
545 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
; sqdmlal2_2d: sqdmull.v2i64 on the upper halves of two <4 x i32> loads,
; accumulated with sqadd.v2i64 onto the vector loaded from %C. The assertions
; expect the upper halves to be loaded directly from offset #8 and a plain
; sqdmlal.2d on the accumulator.
549 define <2 x i64> @sqdmlal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
550 ; CHECK-LABEL: sqdmlal2_2d:
552 ; CHECK-NEXT: ldr q0, [x2]
553 ; CHECK-NEXT: ldr d1, [x0, #8]
554 ; CHECK-NEXT: ldr d2, [x1, #8]
555 ; CHECK-NEXT: sqdmlal.2d v0, v1, v2
557 %load1 = load <4 x i32>, ptr %A
558 %load2 = load <4 x i32>, ptr %B
559 %tmp3 = load <2 x i64>, ptr %C
; The shuffle masks <2, 3> select the upper two elements of each load.
560 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
561 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
562 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
563 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
; sqdmlsl4s: feeds the sqdmull.v4i32 product of the <4 x i16> vectors at %A
; and %B into sqsub.v4i32 with the accumulator loaded from %C. The assertions
; expect the pair of intrinsics to be fused into a single sqdmlsl.4s.
567 define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
568 ; CHECK-LABEL: sqdmlsl4s:
570 ; CHECK-NEXT: ldr d1, [x0]
571 ; CHECK-NEXT: ldr d2, [x1]
572 ; CHECK-NEXT: ldr q0, [x2]
573 ; CHECK-NEXT: sqdmlsl.4s v0, v1, v2
575 %tmp1 = load <4 x i16>, ptr %A
576 %tmp2 = load <4 x i16>, ptr %B
577 %tmp3 = load <4 x i32>, ptr %C
578 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
579 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
; sqdmlsl2d: feeds the sqdmull.v2i64 product of the <2 x i32> vectors at %A
; and %B into sqsub.v2i64 with the accumulator loaded from %C. The assertions
; expect the pair of intrinsics to be fused into a single sqdmlsl.2d.
583 define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
584 ; CHECK-LABEL: sqdmlsl2d:
586 ; CHECK-NEXT: ldr d1, [x0]
587 ; CHECK-NEXT: ldr d2, [x1]
588 ; CHECK-NEXT: ldr q0, [x2]
589 ; CHECK-NEXT: sqdmlsl.2d v0, v1, v2
591 %tmp1 = load <2 x i32>, ptr %A
592 %tmp2 = load <2 x i32>, ptr %B
593 %tmp3 = load <2 x i64>, ptr %C
594 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
595 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
; sqdmlsl2_4s: sqdmull.v4i32 on the upper halves of two <8 x i16> loads,
; combined with sqsub.v4i32 against the vector loaded from %C. The assertions
; expect the upper halves to be loaded directly from offset #8 and a plain
; sqdmlsl.4s on the accumulator.
599 define <4 x i32> @sqdmlsl2_4s(ptr %A, ptr %B, ptr %C) nounwind {
600 ; CHECK-LABEL: sqdmlsl2_4s:
602 ; CHECK-NEXT: ldr q0, [x2]
603 ; CHECK-NEXT: ldr d1, [x0, #8]
604 ; CHECK-NEXT: ldr d2, [x1, #8]
605 ; CHECK-NEXT: sqdmlsl.4s v0, v1, v2
607 %load1 = load <8 x i16>, ptr %A
608 %load2 = load <8 x i16>, ptr %B
609 %tmp3 = load <4 x i32>, ptr %C
; The shuffle masks <4, 5, 6, 7> select the upper four elements of each load.
610 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
611 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
612 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
613 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
; sqdmlsl2_2d: sqdmull.v2i64 on the upper halves of two <4 x i32> loads,
; combined with sqsub.v2i64 against the vector loaded from %C. The assertions
; expect the upper halves to be loaded directly from offset #8 and a plain
; sqdmlsl.2d on the accumulator.
617 define <2 x i64> @sqdmlsl2_2d(ptr %A, ptr %B, ptr %C) nounwind {
618 ; CHECK-LABEL: sqdmlsl2_2d:
620 ; CHECK-NEXT: ldr q0, [x2]
621 ; CHECK-NEXT: ldr d1, [x0, #8]
622 ; CHECK-NEXT: ldr d2, [x1, #8]
623 ; CHECK-NEXT: sqdmlsl.2d v0, v1, v2
625 %load1 = load <4 x i32>, ptr %A
626 %load2 = load <4 x i32>, ptr %B
627 %tmp3 = load <2 x i64>, ptr %C
; The shuffle masks <2, 3> select the upper two elements of each load.
628 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
629 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
630 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
631 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
; umlal4s: adds the umull.v4i32 product of the <4 x i16> vectors at %A and %B
; to the <4 x i32> accumulator loaded from %C. The assertions expect the
; multiply and add to be fused into a single umlal.4s on the accumulator.
635 define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind {
636 ; CHECK-LABEL: umlal4s:
638 ; CHECK-NEXT: ldr d1, [x0]
639 ; CHECK-NEXT: ldr d2, [x1]
640 ; CHECK-NEXT: ldr q0, [x2]
641 ; CHECK-NEXT: umlal.4s v0, v1, v2
643 %tmp1 = load <4 x i16>, ptr %A
644 %tmp2 = load <4 x i16>, ptr %B
645 %tmp3 = load <4 x i32>, ptr %C
646 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
647 %tmp5 = add <4 x i32> %tmp3, %tmp4
; umlal2d: adds the umull.v2i64 product of the <2 x i32> vectors at %A and %B
; to the <2 x i64> accumulator loaded from %C. The assertions expect the
; multiply and add to be fused into a single umlal.2d on the accumulator.
651 define <2 x i64> @umlal2d(ptr %A, ptr %B, ptr %C) nounwind {
652 ; CHECK-LABEL: umlal2d:
654 ; CHECK-NEXT: ldr d1, [x0]
655 ; CHECK-NEXT: ldr d2, [x1]
656 ; CHECK-NEXT: ldr q0, [x2]
657 ; CHECK-NEXT: umlal.2d v0, v1, v2
659 %tmp1 = load <2 x i32>, ptr %A
660 %tmp2 = load <2 x i32>, ptr %B
661 %tmp3 = load <2 x i64>, ptr %C
662 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
663 %tmp5 = add <2 x i64> %tmp3, %tmp4
; umlal8h_chain_with_constant: computes umull(%v1, %v3) + splat(257) +
; umull(%v2, ~%v3) and stores it to %dst. The assertions expect the constant
; to be materialized first (movi.16b #1 gives 0x0101 = 257 in each i16 lane)
; and both multiplies to be folded into umlal.8h on that register.
667 define void @umlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
668 ; CHECK-LABEL: umlal8h_chain_with_constant:
670 ; CHECK-NEXT: movi.16b v3, #1
671 ; CHECK-NEXT: umlal.8h v3, v0, v2
672 ; CHECK-NEXT: mvn.8b v0, v2
673 ; CHECK-NEXT: umlal.8h v3, v1, v0
674 ; CHECK-NEXT: str q3, [x0]
; xor with all-ones is a bitwise NOT of %v3 (the mvn.8b in the assertions).
676 %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
677 %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
678 %add.1 = add <8 x i16> %umull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
679 %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
680 %add.2 = add <8 x i16> %add.1, %umull.2
681 store <8 x i16> %add.2, ptr %dst
; umlal2d_chain_with_constant: computes umull(%v1, %v3) + splat(257) +
; umull(%v2, ~%v3) and stores it to %dst. The assertions expect the constant
; to be materialized first (mov #257 then dup.2d) and both multiplies to be
; folded into umlal.2d on that register.
685 define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
686 ; CHECK-LABEL: umlal2d_chain_with_constant:
688 ; CHECK-NEXT: mov w8, #257 // =0x101
689 ; CHECK-NEXT: dup.2d v3, x8
690 ; CHECK-NEXT: umlal.2d v3, v0, v2
691 ; CHECK-NEXT: mvn.8b v0, v2
692 ; CHECK-NEXT: umlal.2d v3, v1, v0
693 ; CHECK-NEXT: str q3, [x0]
; xor with all-ones is a bitwise NOT of %v3 (the mvn.8b in the assertions).
695 %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
696 %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
697 %add.1 = add <2 x i64> %umull.1, <i64 257, i64 257>
698 %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
699 %add.2 = add <2 x i64> %add.1, %umull.2
700 store <2 x i64> %add.2, ptr %dst
; umlsl4s: subtracts the umull.v4i32 product of the <4 x i16> vectors at %A
; and %B from the <4 x i32> accumulator loaded from %C. The assertions expect
; the multiply and subtract to be fused into a single umlsl.4s.
704 define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
705 ; CHECK-LABEL: umlsl4s:
707 ; CHECK-NEXT: ldr d1, [x0]
708 ; CHECK-NEXT: ldr d2, [x1]
709 ; CHECK-NEXT: ldr q0, [x2]
710 ; CHECK-NEXT: umlsl.4s v0, v1, v2
712 %tmp1 = load <4 x i16>, ptr %A
713 %tmp2 = load <4 x i16>, ptr %B
714 %tmp3 = load <4 x i32>, ptr %C
715 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
716 %tmp5 = sub <4 x i32> %tmp3, %tmp4
; umlsl2d: subtracts the umull.v2i64 product of the <2 x i32> vectors at %A
; and %B from the <2 x i64> accumulator loaded from %C. The assertions expect
; the multiply and subtract to be fused into a single umlsl.2d.
720 define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
721 ; CHECK-LABEL: umlsl2d:
723 ; CHECK-NEXT: ldr d1, [x0]
724 ; CHECK-NEXT: ldr d2, [x1]
725 ; CHECK-NEXT: ldr q0, [x2]
726 ; CHECK-NEXT: umlsl.2d v0, v1, v2
728 %tmp1 = load <2 x i32>, ptr %A
729 %tmp2 = load <2 x i32>, ptr %B
730 %tmp3 = load <2 x i64>, ptr %C
731 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
732 %tmp5 = sub <2 x i64> %tmp3, %tmp4
; umlsl8h_chain_with_constant: computes splat(257) - umull(%v1, %v3) -
; umull(%v2, ~%v3) and stores it to %dst. The assertions expect the constant
; to be materialized first (movi.16b #1 gives 0x0101 = 257 in each i16 lane)
; and both multiplies to be folded into umlsl.8h on that register.
; The intermediate values are named %sub.N because they are produced by `sub`.
736 define void @umlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
737 ; CHECK-LABEL: umlsl8h_chain_with_constant:
739 ; CHECK-NEXT: movi.16b v3, #1
740 ; CHECK-NEXT: umlsl.8h v3, v0, v2
741 ; CHECK-NEXT: mvn.8b v0, v2
742 ; CHECK-NEXT: umlsl.8h v3, v1, v0
743 ; CHECK-NEXT: str q3, [x0]
; xor with all-ones is a bitwise NOT of %v3 (the mvn.8b in the assertions).
745 %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
746 %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
747 %sub.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %umull.1
748 %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
749 %sub.2 = sub <8 x i16> %sub.1, %umull.2
750 store <8 x i16> %sub.2, ptr %dst
; umlsl2d_chain_with_constant: computes splat(257) - umull(%v1, %v3) -
; umull(%v2, ~%v3) and stores it to %dst. The assertions expect the constant
; to be materialized first (mov #257 then dup.2d) and both multiplies to be
; folded into umlsl.2d on that register.
; The intermediate values are named %sub.N because they are produced by `sub`.
754 define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
755 ; CHECK-LABEL: umlsl2d_chain_with_constant:
757 ; CHECK-NEXT: mov w8, #257 // =0x101
758 ; CHECK-NEXT: dup.2d v3, x8
759 ; CHECK-NEXT: umlsl.2d v3, v0, v2
760 ; CHECK-NEXT: mvn.8b v0, v2
761 ; CHECK-NEXT: umlsl.2d v3, v1, v0
762 ; CHECK-NEXT: str q3, [x0]
; xor with all-ones is a bitwise NOT of %v3 (the mvn.8b in the assertions).
764 %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
765 %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
766 %sub.1 = sub <2 x i64> <i64 257, i64 257>, %umull.1
767 %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
768 %sub.2 = sub <2 x i64> %sub.1, %umull.2
769 store <2 x i64> %sub.2, ptr %dst
; fmla_2s: calls llvm.fma.v2f32 on <2 x float> vectors loaded from %A, %B and
; %C (with %C as the addend) and returns the result. The assertions expect
; three d-register loads followed by a single fmla.2s on the addend register.
773 define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind {
774 ; CHECK-LABEL: fmla_2s:
776 ; CHECK-NEXT: ldr d1, [x0]
777 ; CHECK-NEXT: ldr d2, [x1]
778 ; CHECK-NEXT: ldr d0, [x2]
779 ; CHECK-NEXT: fmla.2s v0, v2, v1
781 %tmp1 = load <2 x float>, ptr %A
782 %tmp2 = load <2 x float>, ptr %B
783 %tmp3 = load <2 x float>, ptr %C
784 %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
785 ret <2 x float> %tmp4
; fmla_4s: calls llvm.fma.v4f32 on <4 x float> vectors loaded from %A, %B and
; %C (with %C as the addend) and returns the result. The assertions expect
; three q-register loads followed by a single fmla.4s on the addend register.
788 define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind {
789 ; CHECK-LABEL: fmla_4s:
791 ; CHECK-NEXT: ldr q1, [x0]
792 ; CHECK-NEXT: ldr q2, [x1]
793 ; CHECK-NEXT: ldr q0, [x2]
794 ; CHECK-NEXT: fmla.4s v0, v2, v1
796 %tmp1 = load <4 x float>, ptr %A
797 %tmp2 = load <4 x float>, ptr %B
798 %tmp3 = load <4 x float>, ptr %C
799 %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
800 ret <4 x float> %tmp4
; fmla_2d: calls llvm.fma.v2f64 on <2 x double> vectors loaded from %A, %B and
; %C (with %C as the addend) and returns the result. The assertions expect
; three q-register loads followed by a single fmla.2d on the addend register.
803 define <2 x double> @fmla_2d(ptr %A, ptr %B, ptr %C) nounwind {
804 ; CHECK-LABEL: fmla_2d:
806 ; CHECK-NEXT: ldr q1, [x0]
807 ; CHECK-NEXT: ldr q2, [x1]
808 ; CHECK-NEXT: ldr q0, [x2]
809 ; CHECK-NEXT: fmla.2d v0, v2, v1
811 %tmp1 = load <2 x double>, ptr %A
812 %tmp2 = load <2 x double>, ptr %B
813 %tmp3 = load <2 x double>, ptr %C
814 %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
815 ret <2 x double> %tmp4
818 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
819 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
820 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; fmls_2s: calls llvm.fma.v2f32 with a negated second multiplicand
; (%tmp1 * -%tmp2 + %tmp3) and returns the result. The assertions expect the
; negation to be absorbed, leaving a single fmls.2s on the addend register.
822 define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind {
823 ; CHECK-LABEL: fmls_2s:
825 ; CHECK-NEXT: ldr d1, [x0]
826 ; CHECK-NEXT: ldr d2, [x1]
827 ; CHECK-NEXT: ldr d0, [x2]
828 ; CHECK-NEXT: fmls.2s v0, v1, v2
830 %tmp1 = load <2 x float>, ptr %A
831 %tmp2 = load <2 x float>, ptr %B
832 %tmp3 = load <2 x float>, ptr %C
; Subtracting from -0.0 is the fsub spelling of floating-point negation.
833 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
834 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3)
835 ret <2 x float> %tmp5
; fmls_4s: calls llvm.fma.v4f32 with a negated second multiplicand
; (%tmp1 * -%tmp2 + %tmp3) and returns the result. The assertions expect the
; negation to be absorbed, leaving a single fmls.4s on the addend register.
838 define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind {
839 ; CHECK-LABEL: fmls_4s:
841 ; CHECK-NEXT: ldr q1, [x0]
842 ; CHECK-NEXT: ldr q2, [x1]
843 ; CHECK-NEXT: ldr q0, [x2]
844 ; CHECK-NEXT: fmls.4s v0, v1, v2
846 %tmp1 = load <4 x float>, ptr %A
847 %tmp2 = load <4 x float>, ptr %B
848 %tmp3 = load <4 x float>, ptr %C
; Subtracting from -0.0 is the fsub spelling of floating-point negation.
849 %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
850 %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3)
851 ret <4 x float> %tmp5
; fmls_2d: calls llvm.fma.v2f64 with a negated second multiplicand
; (%tmp1 * -%tmp2 + %tmp3) and returns the result. The assertions expect the
; negation to be absorbed, leaving a single fmls.2d on the addend register.
854 define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind {
855 ; CHECK-LABEL: fmls_2d:
857 ; CHECK-NEXT: ldr q1, [x0]
858 ; CHECK-NEXT: ldr q2, [x1]
859 ; CHECK-NEXT: ldr q0, [x2]
860 ; CHECK-NEXT: fmls.2d v0, v1, v2
862 %tmp1 = load <2 x double>, ptr %A
863 %tmp2 = load <2 x double>, ptr %B
864 %tmp3 = load <2 x double>, ptr %C
; Subtracting from -0.0 is the fsub spelling of floating-point negation.
865 %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
866 %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3)
867 ret <2 x double> %tmp5
; fmls_commuted_neg_2s: same computation as a negated-multiplicand fma, but
; with the negated value passed as the FIRST fma operand (-%tmp2 * %tmp1 +
; %tmp3). The assertions expect the same single fmls.2s to be selected.
870 define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind {
871 ; CHECK-LABEL: fmls_commuted_neg_2s:
873 ; CHECK-NEXT: ldr d1, [x0]
874 ; CHECK-NEXT: ldr d2, [x1]
875 ; CHECK-NEXT: ldr d0, [x2]
876 ; CHECK-NEXT: fmls.2s v0, v1, v2
878 %tmp1 = load <2 x float>, ptr %A
879 %tmp2 = load <2 x float>, ptr %B
880 %tmp3 = load <2 x float>, ptr %C
; Subtracting from -0.0 is the fsub spelling of floating-point negation.
881 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
882 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3)
883 ret <2 x float> %tmp5
; fmls_commuted_neg_4s: same computation as a negated-multiplicand fma, but
; with the negated value passed as the FIRST fma operand (-%tmp2 * %tmp1 +
; %tmp3). The assertions expect the same single fmls.4s to be selected.
886 define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind {
887 ; CHECK-LABEL: fmls_commuted_neg_4s:
889 ; CHECK-NEXT: ldr q1, [x0]
890 ; CHECK-NEXT: ldr q2, [x1]
891 ; CHECK-NEXT: ldr q0, [x2]
892 ; CHECK-NEXT: fmls.4s v0, v1, v2
894 %tmp1 = load <4 x float>, ptr %A
895 %tmp2 = load <4 x float>, ptr %B
896 %tmp3 = load <4 x float>, ptr %C
; Subtracting from -0.0 is the fsub spelling of floating-point negation.
897 %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
898 %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3)
899 ret <4 x float> %tmp5
; fmls_commuted_neg_2d: same computation as a negated-multiplicand fma, but
; with the negated value passed as the FIRST fma operand (-%tmp2 * %tmp1 +
; %tmp3). The assertions expect the same single fmls.2d to be selected.
902 define <2 x double> @fmls_commuted_neg_2d(ptr %A, ptr %B, ptr %C) nounwind {
903 ; CHECK-LABEL: fmls_commuted_neg_2d:
905 ; CHECK-NEXT: ldr q1, [x0]
906 ; CHECK-NEXT: ldr q2, [x1]
907 ; CHECK-NEXT: ldr q0, [x2]
908 ; CHECK-NEXT: fmls.2d v0, v1, v2
910 %tmp1 = load <2 x double>, ptr %A
911 %tmp2 = load <2 x double>, ptr %B
912 %tmp3 = load <2 x double>, ptr %C
; Subtracting from -0.0 is the fsub spelling of floating-point negation.
913 %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
914 %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3)
915 ret <2 x double> %tmp5
; fmls_indexed_2s: fma of the negated %c with a splat of lane 0 of %b, added
; to %a. The assertions expect the by-element form fmls.2s ... v1[0], with
; the d1 argument first widened to q1 for the lane reference.
918 define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
919 ; CHECK-LABEL: fmls_indexed_2s:
920 ; CHECK: // %bb.0: // %entry
921 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
922 ; CHECK-NEXT: fmls.2s v0, v2, v1[0]
; Subtracting from -0.0 negates %c.
925 %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
; A zeroinitializer shuffle mask broadcasts element 0 of %b to every lane.
926 %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
927 %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
928 ret <2 x float> %fmls1
; fmls_indexed_4s: fma of the negated %c with a splat of lane 0 of %b, added
; to %a. The assertions expect the by-element form fmls.4s ... v1[0].
931 define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
932 ; CHECK-LABEL: fmls_indexed_4s:
933 ; CHECK: // %bb.0: // %entry
934 ; CHECK-NEXT: fmls.4s v0, v2, v1[0]
; Subtracting from -0.0 negates %c.
937 %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
; A zeroinitializer shuffle mask broadcasts element 0 of %b to every lane.
938 %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
939 %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
940 ret <4 x float> %fmls1
; fma(-c, splat(b[0]), a) on <2 x double>: %c is negated via fsub from -0.0 and
; %b is broadcast from lane 0. The expected assembly is the by-element form
; fmls.2d ..., v1[0].
943 define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
944 ; CHECK-LABEL: fmls_indexed_2d:
945 ; CHECK: // %bb.0: // %entry
946 ; CHECK-NEXT: fmls.2d v0, v2, v1[0]
949 %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
950 %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
951 %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
952 ret <2 x double> %fmls1
; fma(splat(c), b, a) on <2 x float>: scalar %c is inserted into both lanes
; (%v2) and that splat is the first fma operand, so the expected assembly is
; the by-element form fmla.2s ..., v2[0], the same output expected for
; fmla_indexed_scalar_2s_strict.
; Previously the call passed %v1 (only lane 0 set), which left %v2 dead and
; made the test expect the plain vector fmla.2s instead of the indexed form.
; NOTE(review): the expected-output lines below were updated by hand to match
; the strict variant; re-run utils/update_llc_test_checks.py to confirm.
955 define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
956 ; CHECK-LABEL: fmla_indexed_scalar_2s:
957 ; CHECK: // %bb.0: // %entry
958 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
959 ; CHECK-NEXT: fmla.2s v0, v1, v2[0]
962 %v1 = insertelement <2 x float> undef, float %c, i32 0
963 %v2 = insertelement <2 x float> %v1, float %c, i32 1
964 %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a) nounwind
965 ret <2 x float> %fmla1
; fma(splat(c), b, a) on <4 x float>: scalar %c is inserted into all four lanes
; and the splat is the first fma operand. The expected assembly is the
; by-element form fmla.4s ..., v2[0].
968 define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
969 ; CHECK-LABEL: fmla_indexed_scalar_4s:
970 ; CHECK: // %bb.0: // %entry
971 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
972 ; CHECK-NEXT: fmla.4s v0, v1, v2[0]
975 %v1 = insertelement <4 x float> undef, float %c, i32 0
976 %v2 = insertelement <4 x float> %v1, float %c, i32 1
977 %v3 = insertelement <4 x float> %v2, float %c, i32 2
978 %v4 = insertelement <4 x float> %v3, float %c, i32 3
979 %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
980 ret <4 x float> %fmla1
; fma(splat(c), b, a) on <2 x double>: scalar %c is inserted into both lanes
; and the splat is the first fma operand. The expected assembly is the
; by-element form fmla.2d ..., v2[0].
983 define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
984 ; CHECK-LABEL: fmla_indexed_scalar_2d:
985 ; CHECK: // %bb.0: // %entry
986 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
987 ; CHECK-NEXT: fmla.2d v0, v1, v2[0]
990 %v1 = insertelement <2 x double> undef, double %c, i32 0
991 %v2 = insertelement <2 x double> %v1, double %c, i32 1
992 %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
993 ret <2 x double> %fmla1
; strictfp counterpart of fmls_indexed_2s: uses fneg and the constrained fma
; intrinsic (round.tonearest, fpexcept.strict). The expected assembly is still
; the by-element form fmls.2s ..., v1[0].
996 define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp strictfp {
997 ; CHECK-LABEL: fmls_indexed_2s_strict:
998 ; CHECK: // %bb.0: // %entry
999 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1000 ; CHECK-NEXT: fmls.2s v0, v2, v1[0]
1003 %0 = fneg <2 x float> %c
1004 %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
1005 %fmls1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1006 ret <2 x float> %fmls1
; strictfp counterpart of fmls_indexed_4s: uses fneg and the constrained fma
; intrinsic (round.tonearest, fpexcept.strict). The expected assembly is the
; by-element form fmls.4s ..., v1[0].
1009 define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp strictfp {
1010 ; CHECK-LABEL: fmls_indexed_4s_strict:
1011 ; CHECK: // %bb.0: // %entry
1012 ; CHECK-NEXT: fmls.4s v0, v2, v1[0]
1015 %0 = fneg <4 x float> %c
1016 %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
1017 %fmls1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1018 ret <4 x float> %fmls1
; strictfp counterpart of fmls_indexed_2d: uses fneg and the constrained fma
; intrinsic (round.tonearest, fpexcept.strict). The expected assembly is the
; by-element form fmls.2d ..., v1[0].
1021 define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp strictfp {
1022 ; CHECK-LABEL: fmls_indexed_2d_strict:
1023 ; CHECK: // %bb.0: // %entry
1024 ; CHECK-NEXT: fmls.2d v0, v2, v1[0]
1027 %0 = fneg <2 x double> %c
1028 %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
1029 %fmls1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1030 ret <2 x double> %fmls1
; Constrained-fma version of the scalar-splat fmla test on <2 x float>: %c is
; inserted into both lanes (%v2) and %v2 is the first fma operand. The expected
; assembly is the by-element form fmla.2s ..., v2[0].
1033 define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp strictfp {
1034 ; CHECK-LABEL: fmla_indexed_scalar_2s_strict:
1035 ; CHECK: // %bb.0: // %entry
1036 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
1037 ; CHECK-NEXT: fmla.2s v0, v1, v2[0]
1040 %v1 = insertelement <2 x float> undef, float %c, i32 0
1041 %v2 = insertelement <2 x float> %v1, float %c, i32 1
1042 %fmla1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1043 ret <2 x float> %fmla1
; Constrained-fma version of the scalar-splat fmla test on <4 x float>: %c is
; inserted into all four lanes and the splat is the first fma operand. The
; expected assembly is the by-element form fmla.4s ..., v2[0].
1046 define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp strictfp {
1047 ; CHECK-LABEL: fmla_indexed_scalar_4s_strict:
1048 ; CHECK: // %bb.0: // %entry
1049 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
1050 ; CHECK-NEXT: fmla.4s v0, v1, v2[0]
1053 %v1 = insertelement <4 x float> undef, float %c, i32 0
1054 %v2 = insertelement <4 x float> %v1, float %c, i32 1
1055 %v3 = insertelement <4 x float> %v2, float %c, i32 2
1056 %v4 = insertelement <4 x float> %v3, float %c, i32 3
1057 %fmla1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1058 ret <4 x float> %fmla1
; Constrained-fma version of the scalar-splat fmla test on <2 x double>: %c is
; inserted into both lanes and the splat is the first fma operand. The expected
; assembly is the by-element form fmla.2d ..., v2[0].
1061 define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp strictfp {
1062 ; CHECK-LABEL: fmla_indexed_scalar_2d_strict:
1063 ; CHECK: // %bb.0: // %entry
1064 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1065 ; CHECK-NEXT: fmla.2d v0, v1, v2[0]
1068 %v1 = insertelement <2 x double> undef, double %c, i32 0
1069 %v2 = insertelement <2 x double> %v1, double %c, i32 1
1070 %fmla1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1071 ret <2 x double> %fmla1
1074 attributes #0 = { strictfp }
1076 declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x float>, <2 x float>, metadata, metadata)
1077 declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
1078 declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
; Integer multiply of %A by a lane-1 broadcast of %B on <4 x i16>. The expected
; assembly is the by-element form mul.4h ..., v1[1].
1080 define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
1081 ; CHECK-LABEL: mul_4h:
1083 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1084 ; CHECK-NEXT: mul.4h v0, v0, v1[1]
1086 %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1087 %tmp4 = mul <4 x i16> %A, %tmp3
; Integer multiply of %A by a lane-1 broadcast of %B on <8 x i16>. The expected
; assembly is the by-element form mul.8h ..., v1[1].
1091 define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
1092 ; CHECK-LABEL: mul_8h:
1094 ; CHECK-NEXT: mul.8h v0, v0, v1[1]
1096 %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1097 %tmp4 = mul <8 x i16> %A, %tmp3
; Integer multiply of %A by a lane-1 broadcast of %B on <2 x i32>. The expected
; assembly is the by-element form mul.2s ..., v1[1].
1101 define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
1102 ; CHECK-LABEL: mul_2s:
1104 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1105 ; CHECK-NEXT: mul.2s v0, v0, v1[1]
1107 %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1108 %tmp4 = mul <2 x i32> %A, %tmp3
; Integer multiply of %A by a lane-1 broadcast of %B on <4 x i32>. The expected
; assembly is the by-element form mul.4s ..., v1[1].
1112 define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
1113 ; CHECK-LABEL: mul_4s:
1115 ; CHECK-NEXT: mul.4s v0, v0, v1[1]
1117 %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1118 %tmp4 = mul <4 x i32> %A, %tmp3
; Plain <2 x i64> multiply of %A and %B (no lane broadcast). The expected
; assembly contains no vector multiply: each lane is moved to a general
; register, multiplied with scalar mul, and the two products are reassembled
; into v0.
1122 define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
1123 ; CHECK-LABEL: mul_2d:
1125 ; CHECK-NEXT: fmov x10, d1
1126 ; CHECK-NEXT: fmov x11, d0
1127 ; CHECK-NEXT: mov.d x8, v1[1]
1128 ; CHECK-NEXT: mov.d x9, v0[1]
1129 ; CHECK-NEXT: mul x10, x11, x10
1130 ; CHECK-NEXT: mul x8, x9, x8
1131 ; CHECK-NEXT: fmov d0, x10
1132 ; CHECK-NEXT: mov.d v0[1], x8
1134 %tmp1 = mul <2 x i64> %A, %B
; fmul of %A by a lane-1 broadcast of %B on <2 x float>. The expected assembly
; is the by-element form fmul.2s ..., v1[1].
1138 define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
1139 ; CHECK-LABEL: fmul_lane_2s:
1141 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1142 ; CHECK-NEXT: fmul.2s v0, v0, v1[1]
1144 %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
1145 %tmp4 = fmul <2 x float> %A, %tmp3
1146 ret <2 x float> %tmp4
; fmul of %A by a lane-1 broadcast of %B on <4 x float>. The expected assembly
; is the by-element form fmul.4s ..., v1[1].
1149 define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
1150 ; CHECK-LABEL: fmul_lane_4s:
1152 ; CHECK-NEXT: fmul.4s v0, v0, v1[1]
1154 %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1155 %tmp4 = fmul <4 x float> %A, %tmp3
1156 ret <4 x float> %tmp4
; fmul of %A by a lane-1 broadcast of %B on <2 x double>. The expected assembly
; is the by-element form fmul.2d ..., v1[1].
1159 define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
1160 ; CHECK-LABEL: fmul_lane_2d:
1162 ; CHECK-NEXT: fmul.2d v0, v0, v1[1]
1164 %tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
1165 %tmp4 = fmul <2 x double> %A, %tmp3
1166 ret <2 x double> %tmp4
; Scalar float fmul where the second operand is element 3 extracted from a
; <4 x float>. The expected assembly is the scalar by-element form
; fmul.s ..., v1[3].
1169 define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
1170 ; CHECK-LABEL: fmul_lane_s:
1172 ; CHECK-NEXT: fmul.s s0, s0, v1[3]
1174 %B = extractelement <4 x float> %vec, i32 3
1175 %res = fmul float %A, %B
; Scalar double fmul where the second operand is element 1 extracted from a
; <2 x double>. The expected assembly is the scalar by-element form
; fmul.d ..., v1[1].
1179 define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
1180 ; CHECK-LABEL: fmul_lane_d:
1182 ; CHECK-NEXT: fmul.d d0, d0, v1[1]
1184 %B = extractelement <2 x double> %vec, i32 1
1185 %res = fmul double %A, %B
; llvm.aarch64.neon.fmulx of %A and a lane-1 broadcast of %B on <2 x float>.
; The expected assembly is the by-element form fmulx.2s ..., v1[1].
1191 define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
1192 ; CHECK-LABEL: fmulx_lane_2s:
1194 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1195 ; CHECK-NEXT: fmulx.2s v0, v0, v1[1]
1197 %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
1198 %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %A, <2 x float> %tmp3)
1199 ret <2 x float> %tmp4
; llvm.aarch64.neon.fmulx of %A and a lane-1 broadcast of %B on <4 x float>.
; The expected assembly is the by-element form fmulx.4s ..., v1[1].
1202 define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
1203 ; CHECK-LABEL: fmulx_lane_4s:
1205 ; CHECK-NEXT: fmulx.4s v0, v0, v1[1]
1207 %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1208 %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %A, <4 x float> %tmp3)
1209 ret <4 x float> %tmp4
; llvm.aarch64.neon.fmulx of %A and a lane-1 broadcast of %B on <2 x double>.
; The expected assembly is the by-element form fmulx.2d ..., v1[1].
1212 define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
1213 ; CHECK-LABEL: fmulx_lane_2d:
1215 ; CHECK-NEXT: fmulx.2d v0, v0, v1[1]
1217 %tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
1218 %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %A, <2 x double> %tmp3)
1219 ret <2 x double> %tmp4
; llvm.aarch64.neon.sqdmulh of %A and a lane-1 broadcast of %B on <4 x i16>.
; The expected assembly is the by-element form sqdmulh.4h ..., v1[1].
1222 define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
1223 ; CHECK-LABEL: sqdmulh_lane_4h:
1225 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1226 ; CHECK-NEXT: sqdmulh.4h v0, v0, v1[1]
1228 %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1229 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3)
; llvm.aarch64.neon.sqdmulh of %A and a lane-1 broadcast of %B on <8 x i16>.
; The expected assembly is the by-element form sqdmulh.8h ..., v1[1].
1233 define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
1234 ; CHECK-LABEL: sqdmulh_lane_8h:
1236 ; CHECK-NEXT: sqdmulh.8h v0, v0, v1[1]
1238 %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1239 %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3)
; llvm.aarch64.neon.sqdmulh of %A and a lane-1 broadcast of %B on <2 x i32>.
; The expected assembly is the by-element form sqdmulh.2s ..., v1[1].
1243 define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
1244 ; CHECK-LABEL: sqdmulh_lane_2s:
1246 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1247 ; CHECK-NEXT: sqdmulh.2s v0, v0, v1[1]
1249 %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1250 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3)
; llvm.aarch64.neon.sqdmulh of %A and a lane-1 broadcast of %B on <4 x i32>.
; The expected assembly is the by-element form sqdmulh.4s ..., v1[1].
1254 define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
1255 ; CHECK-LABEL: sqdmulh_lane_4s:
1257 ; CHECK-NEXT: sqdmulh.4s v0, v0, v1[1]
1259 %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1260 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3)
; Scalar i32 sqdmulh where the second operand is element 1 extracted from a
; <4 x i32>. The expected assembly moves %A into an FP/SIMD register, uses the
; scalar by-element form sqdmulh.s ..., v0[1], and moves the result back to w0.
1264 define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
1265 ; CHECK-LABEL: sqdmulh_lane_1s:
1267 ; CHECK-NEXT: fmov s1, w0
1268 ; CHECK-NEXT: sqdmulh.s s0, s1, v0[1]
1269 ; CHECK-NEXT: fmov w0, s0
1271 %tmp1 = extractelement <4 x i32> %B, i32 1
1272 %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
; llvm.aarch64.neon.sqrdmulh of %A and a lane-1 broadcast of %B on <4 x i16>.
; The expected assembly is the by-element form sqrdmulh.4h ..., v1[1].
1276 define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
1277 ; CHECK-LABEL: sqrdmulh_lane_4h:
1279 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1280 ; CHECK-NEXT: sqrdmulh.4h v0, v0, v1[1]
1282 %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1283 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3)
; llvm.aarch64.neon.sqrdmulh of %A and a lane-1 broadcast of %B on <8 x i16>.
; The expected assembly is the by-element form sqrdmulh.8h ..., v1[1].
1287 define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
1288 ; CHECK-LABEL: sqrdmulh_lane_8h:
1290 ; CHECK-NEXT: sqrdmulh.8h v0, v0, v1[1]
1292 %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1293 %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3)
; llvm.aarch64.neon.sqrdmulh of %A and a lane-1 broadcast of %B on <2 x i32>.
; The expected assembly is the by-element form sqrdmulh.2s ..., v1[1].
1297 define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
1298 ; CHECK-LABEL: sqrdmulh_lane_2s:
1300 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1301 ; CHECK-NEXT: sqrdmulh.2s v0, v0, v1[1]
1303 %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1304 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3)
; llvm.aarch64.neon.sqrdmulh of %A and a lane-1 broadcast of %B on <4 x i32>.
; The expected assembly is the by-element form sqrdmulh.4s ..., v1[1].
1308 define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
1309 ; CHECK-LABEL: sqrdmulh_lane_4s:
1311 ; CHECK-NEXT: sqrdmulh.4s v0, v0, v1[1]
1313 %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1314 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3)
; Scalar i32 sqrdmulh where the second operand is element 1 extracted from a
; <4 x i32>. The expected assembly moves %A into an FP/SIMD register, uses the
; scalar by-element form sqrdmulh.s ..., v0[1], and moves the result back to w0.
1318 define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
1319 ; CHECK-LABEL: sqrdmulh_lane_1s:
1321 ; CHECK-NEXT: fmov s1, w0
1322 ; CHECK-NEXT: sqrdmulh.s s0, s1, v0[1]
1323 ; CHECK-NEXT: fmov w0, s0
1325 %tmp1 = extractelement <4 x i32> %B, i32 1
1326 %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
; llvm.aarch64.neon.sqdmull (<4 x i16> inputs, <4 x i32> result) of %A and a
; lane-1 broadcast of %B. The expected assembly is sqdmull.4s ..., v1[1].
1330 define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
1331 ; CHECK-LABEL: sqdmull_lane_4s:
1333 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1334 ; CHECK-NEXT: sqdmull.4s v0, v0, v1[1]
1336 %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1337 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
; llvm.aarch64.neon.sqdmull (<2 x i32> inputs, <2 x i64> result) of %A and a
; lane-1 broadcast of %B. The expected assembly is sqdmull.2d ..., v1[1].
1341 define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
1342 ; CHECK-LABEL: sqdmull_lane_2d:
1344 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1345 ; CHECK-NEXT: sqdmull.2d v0, v0, v1[1]
1347 %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1348 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
; sqdmull on the upper four elements of %A (shuffle indices 4-7) and a lane-1
; broadcast of %B. The expected assembly is the high-half by-element form
; sqdmull2.4s ..., v1[1].
1352 define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind {
1353 ; CHECK-LABEL: sqdmull2_lane_4s:
1355 ; CHECK-NEXT: sqdmull2.4s v0, v0, v1[1]
1357 %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1358 %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1359 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
; sqdmull on the upper two elements of %A (shuffle indices 2-3) and a lane-1
; broadcast of %B. The expected assembly is the high-half by-element form
; sqdmull2.2d ..., v1[1].
1363 define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind {
1364 ; CHECK-LABEL: sqdmull2_lane_2d:
1366 ; CHECK-NEXT: sqdmull2.2d v0, v0, v1[1]
1368 %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1369 %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1370 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
; llvm.aarch64.neon.umull (<4 x i16> inputs, <4 x i32> result) of %A and a
; lane-1 broadcast of %B. The expected assembly is umull.4s ..., v1[1].
1374 define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
1375 ; CHECK-LABEL: umull_lane_4s:
1377 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1378 ; CHECK-NEXT: umull.4s v0, v0, v1[1]
1380 %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1381 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
; llvm.aarch64.neon.umull (<2 x i32> inputs, <2 x i64> result) of %A and a
; lane-1 broadcast of %B. The expected assembly is umull.2d ..., v1[1].
1385 define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
1386 ; CHECK-LABEL: umull_lane_2d:
1388 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1389 ; CHECK-NEXT: umull.2d v0, v0, v1[1]
1391 %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1392 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
; llvm.aarch64.neon.smull (<4 x i16> inputs, <4 x i32> result) of %A and a
; lane-1 broadcast of %B. The expected assembly is smull.4s ..., v1[1].
1396 define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
1397 ; CHECK-LABEL: smull_lane_4s:
1399 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1400 ; CHECK-NEXT: smull.4s v0, v0, v1[1]
1402 %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1403 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
; llvm.aarch64.neon.smull (<2 x i32> inputs, <2 x i64> result) of %A and a
; lane-1 broadcast of %B. The expected assembly is smull.2d ..., v1[1].
1407 define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
1408 ; CHECK-LABEL: smull_lane_2d:
1410 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1411 ; CHECK-NEXT: smull.2d v0, v0, v1[1]
1413 %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1414 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
; %C + smull(%A, splat(%B[1])) on <4 x i32>. The expected assembly folds the
; add into smlal.4s ..., v1[1] accumulating in v2, then copies v2 to v0.
1418 define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1419 ; CHECK-LABEL: smlal_lane_4s:
1421 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1422 ; CHECK-NEXT: smlal.4s v2, v0, v1[1]
1423 ; CHECK-NEXT: mov.16b v0, v2
1425 %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1426 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1427 %tmp6 = add <4 x i32> %C, %tmp5
; %C + smull(%A, splat(%B[1])) on <2 x i64>. The expected assembly folds the
; add into smlal.2d ..., v1[1] accumulating in v2, then copies v2 to v0.
1431 define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1432 ; CHECK-LABEL: smlal_lane_2d:
1434 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1435 ; CHECK-NEXT: smlal.2d v2, v0, v1[1]
1436 ; CHECK-NEXT: mov.16b v0, v2
1438 %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1439 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1440 %tmp6 = add <2 x i64> %C, %tmp5
; sqadd(%C, sqdmull(%A, splat(%B[1]))) on <4 x i32>. The expected assembly
; fuses the pair into sqdmlal.4s ..., v1[1] accumulating in v2, then copies v2
; to v0.
1444 define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1445 ; CHECK-LABEL: sqdmlal_lane_4s:
1447 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1448 ; CHECK-NEXT: sqdmlal.4s v2, v0, v1[1]
1449 ; CHECK-NEXT: mov.16b v0, v2
1451 %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1452 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1453 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
; sqadd(%C, sqdmull(%A, splat(%B[1]))) on <2 x i64>. The expected assembly
; fuses the pair into sqdmlal.2d ..., v1[1] accumulating in v2, then copies v2
; to v0.
1457 define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1458 ; CHECK-LABEL: sqdmlal_lane_2d:
1460 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1461 ; CHECK-NEXT: sqdmlal.2d v2, v0, v1[1]
1462 ; CHECK-NEXT: mov.16b v0, v2
1464 %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1465 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1466 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
; sqadd(%C, sqdmull(upper half of %A, splat(%B[1]))) with <8 x i16> inputs.
; The expected assembly is the high-half by-element form
; sqdmlal2.4s ..., v1[1] accumulating in v2, then a copy of v2 to v0.
1470 define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
1471 ; CHECK-LABEL: sqdmlal2_lane_4s:
1473 ; CHECK-NEXT: sqdmlal2.4s v2, v0, v1[1]
1474 ; CHECK-NEXT: mov.16b v0, v2
1476 %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1477 %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1478 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1479 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
; sqadd(%C, sqdmull(upper half of %A, splat(%B[1]))) with <4 x i32> inputs.
; The expected assembly is the high-half by-element form
; sqdmlal2.2d ..., v1[1] accumulating in v2, then a copy of v2 to v0.
1483 define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
1484 ; CHECK-LABEL: sqdmlal2_lane_2d:
1486 ; CHECK-NEXT: sqdmlal2.2d v2, v0, v1[1]
1487 ; CHECK-NEXT: mov.16b v0, v2
1489 %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1490 %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1491 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1492 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
; Scalar saturating multiply-accumulate written with vector ops: %B is placed
; in lane 0, lane 1 of %C is shuffled into lane 0, the vector sqdmull result's
; lane 0 is extracted and added to %A with the scalar sqadd intrinsic. The
; expected assembly is a single scalar by-element sqdmlal.h ..., v0[1] between
; register moves.
1496 define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1497 ; CHECK-LABEL: sqdmlal_lane_1s:
1499 ; CHECK-NEXT: fmov s1, w1
1500 ; CHECK-NEXT: fmov s2, w0
1501 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1502 ; CHECK-NEXT: sqdmlal.h s2, h1, v0[1]
1503 ; CHECK-NEXT: fmov w0, s2
1505 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1506 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1507 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1508 %prod = extractelement <4 x i32> %prod.vec, i32 0
1509 %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
1512 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
; Scalar saturating multiply-subtract written with vector ops: same shape as
; sqdmlal_lane_1s but the final combine is the scalar sqsub intrinsic. The
; expected assembly is a single scalar by-element sqdmlsl.h ..., v0[1] between
; register moves.
1514 define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1515 ; CHECK-LABEL: sqdmlsl_lane_1s:
1517 ; CHECK-NEXT: fmov s1, w1
1518 ; CHECK-NEXT: fmov s2, w0
1519 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1520 ; CHECK-NEXT: sqdmlsl.h s2, h1, v0[1]
1521 ; CHECK-NEXT: fmov w0, s2
1523 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1524 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1525 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1526 %prod = extractelement <4 x i32> %prod.vec, i32 0
1527 %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
1530 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
; Full-vector sqdmull of %B and %C, then lane 1 (not lane 0) of the product is
; extracted and combined with %A via scalar sqadd. The expected assembly keeps
; the operations separate: sqdmull.4s, a lane-1 move to w8, then scalar sqadd.
1532 define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
1533 ; CHECK-LABEL: sqadd_lane1_sqdmull4s:
1535 ; CHECK-NEXT: sqdmull.4s v0, v0, v1
1536 ; CHECK-NEXT: mov.s w8, v0[1]
1537 ; CHECK-NEXT: fmov s0, w0
1538 ; CHECK-NEXT: fmov s1, w8
1539 ; CHECK-NEXT: sqadd s0, s0, s1
1540 ; CHECK-NEXT: fmov w0, s0
1542 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
1543 %prod = extractelement <4 x i32> %prod.vec, i32 1
1544 %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
; Full-vector sqdmull of %B and %C, then lane 1 (not lane 0) of the product is
; extracted and combined with %A via scalar sqsub. The expected assembly keeps
; the operations separate: sqdmull.4s, a lane-1 move to w8, then scalar sqsub.
1548 define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
1549 ; CHECK-LABEL: sqsub_lane1_sqdmull4s:
1551 ; CHECK-NEXT: sqdmull.4s v0, v0, v1
1552 ; CHECK-NEXT: mov.s w8, v0[1]
1553 ; CHECK-NEXT: fmov s0, w0
1554 ; CHECK-NEXT: fmov s1, w8
1555 ; CHECK-NEXT: sqsub s0, s0, s1
1556 ; CHECK-NEXT: fmov w0, s0
1558 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
1559 %prod = extractelement <4 x i32> %prod.vec, i32 1
1560 %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
; i64 sqadd of %A with the scalar sqdmulls product of %B and element 1 of %C.
; The expected assembly is a single scalar by-element sqdmlal.s ..., v0[1]
; between register moves.
1564 define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1565 ; CHECK-LABEL: sqdmlal_lane_1d:
1567 ; CHECK-NEXT: fmov d1, x0
1568 ; CHECK-NEXT: fmov s2, w1
1569 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1570 ; CHECK-NEXT: sqdmlal.s d1, s2, v0[1]
1571 ; CHECK-NEXT: fmov x0, d1
1573 %rhs = extractelement <2 x i32> %C, i32 1
1574 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1575 %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
1578 declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
1579 declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
; i64 sqsub of %A with the scalar sqdmulls product of %B and element 1 of %C.
; The expected assembly is a single scalar by-element sqdmlsl.s ..., v0[1]
; between register moves.
1581 define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1582 ; CHECK-LABEL: sqdmlsl_lane_1d:
1584 ; CHECK-NEXT: fmov d1, x0
1585 ; CHECK-NEXT: fmov s2, w1
1586 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1587 ; CHECK-NEXT: sqdmlsl.s d1, s2, v0[1]
1588 ; CHECK-NEXT: fmov x0, d1
1590 %rhs = extractelement <2 x i32> %C, i32 1
1591 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1592 %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
1595 declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
; %C + umull(%A, splat(%B[1])) on <4 x i32>. The expected assembly folds the
; add into umlal.4s ..., v1[1] accumulating in v2, then copies v2 to v0.
1598 define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1599 ; CHECK-LABEL: umlal_lane_4s:
1601 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1602 ; CHECK-NEXT: umlal.4s v2, v0, v1[1]
1603 ; CHECK-NEXT: mov.16b v0, v2
1605 %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1606 %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1607 %tmp6 = add <4 x i32> %C, %tmp5
; %C + umull(%A, splat(%B[1])) on <2 x i64>. The expected assembly folds the
; add into umlal.2d ..., v1[1] accumulating in v2, then copies v2 to v0.
1611 define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1612 ; CHECK-LABEL: umlal_lane_2d:
1614 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1615 ; CHECK-NEXT: umlal.2d v2, v0, v1[1]
1616 ; CHECK-NEXT: mov.16b v0, v2
1618 %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1619 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1620 %tmp6 = add <2 x i64> %C, %tmp5
; %C - smull(%A, splat(%B[1])) on <4 x i32>. The expected assembly folds the
; sub into smlsl.4s ..., v1[1] accumulating in v2, then copies v2 to v0.
1625 define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1626 ; CHECK-LABEL: smlsl_lane_4s:
1628 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1629 ; CHECK-NEXT: smlsl.4s v2, v0, v1[1]
1630 ; CHECK-NEXT: mov.16b v0, v2
1632 %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1633 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1634 %tmp6 = sub <4 x i32> %C, %tmp5
; %C - smull(%A, splat(%B[1])) on <2 x i64>. The expected assembly folds the
; sub into smlsl.2d ..., v1[1] accumulating in v2, then copies v2 to v0.
1638 define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1639 ; CHECK-LABEL: smlsl_lane_2d:
1641 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1642 ; CHECK-NEXT: smlsl.2d v2, v0, v1[1]
1643 ; CHECK-NEXT: mov.16b v0, v2
1645 %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1646 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1647 %tmp6 = sub <2 x i64> %C, %tmp5
; sqsub(%C, sqdmull(%A, splat(%B[1]))) on <4 x i32>. The expected assembly
; fuses the pair into sqdmlsl.4s ..., v1[1] accumulating in v2, then copies v2
; to v0.
1651 define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1652 ; CHECK-LABEL: sqdmlsl_lane_4s:
1654 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1655 ; CHECK-NEXT: sqdmlsl.4s v2, v0, v1[1]
1656 ; CHECK-NEXT: mov.16b v0, v2
1658 %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1659 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1660 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
; sqsub(%C, sqdmull(%A, splat(%B[1]))) on <2 x i64>. The expected assembly
; fuses the pair into sqdmlsl.2d ..., v1[1] accumulating in v2, then copies v2
; to v0.
1664 define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1665 ; CHECK-LABEL: sqdmlsl_lane_2d:
1667 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1668 ; CHECK-NEXT: sqdmlsl.2d v2, v0, v1[1]
1669 ; CHECK-NEXT: mov.16b v0, v2
1671 %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1672 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1673 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
; sqsub(%C, sqdmull(upper half of %A, splat(%B[1]))) with <8 x i16> inputs.
; The expected assembly is the high-half by-element form
; sqdmlsl2.4s ..., v1[1] accumulating in v2, then a copy of v2 to v0.
1677 define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
1678 ; CHECK-LABEL: sqdmlsl2_lane_4s:
1680 ; CHECK-NEXT: sqdmlsl2.4s v2, v0, v1[1]
1681 ; CHECK-NEXT: mov.16b v0, v2
1683 %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1684 %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1685 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1686 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
; sqsub(%C, sqdmull(upper half of %A, splat(%B[1]))) with <4 x i32> inputs.
; The expected assembly is the high-half by-element form
; sqdmlsl2.2d ..., v1[1] accumulating in v2, then a copy of v2 to v0.
1690 define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
1691 ; CHECK-LABEL: sqdmlsl2_lane_2d:
1693 ; CHECK-NEXT: sqdmlsl2.2d v2, v0, v1[1]
1694 ; CHECK-NEXT: mov.16b v0, v2
1696 %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1697 %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1698 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1699 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
; %C - umull(%A, splat(%B[1])) on <4 x i32>. The expected assembly folds the
; sub into umlsl.4s ..., v1[1] accumulating in v2, then copies v2 to v0.
1703 define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1704 ; CHECK-LABEL: umlsl_lane_4s:
1706 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1707 ; CHECK-NEXT: umlsl.4s v2, v0, v1[1]
1708 ; CHECK-NEXT: mov.16b v0, v2
1710 %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1711 %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1712 %tmp6 = sub <4 x i32> %C, %tmp5
; %C - umull(%A, splat(%B[1])) on <2 x i64>. The expected assembly folds the
; sub into umlsl.2d ..., v1[1] accumulating in v2, then copies v2 to v0.
1716 define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1717 ; CHECK-LABEL: umlsl_lane_2d:
1719 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1720 ; CHECK-NEXT: umlsl.2d v2, v0, v1[1]
1721 ; CHECK-NEXT: mov.16b v0, v2
1723 %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1724 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1725 %tmp6 = sub <2 x i64> %C, %tmp5
; Scalar float call to llvm.aarch64.neon.fmulx.f32 on two register arguments.
; The expected assembly is a single fmulx s0, s0, s1.
1730 define float @fmulxs(float %a, float %b) nounwind {
1731 ; CHECK-LABEL: fmulxs:
1733 ; CHECK-NEXT: fmulx s0, s0, s1
1735 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
; Scalar double call to llvm.aarch64.neon.fmulx.f64 on two register arguments.
; The expected assembly is a single fmulx d0, d0, d1.
1739 define double @fmulxd(double %a, double %b) nounwind {
1740 ; CHECK-LABEL: fmulxd:
1742 ; CHECK-NEXT: fmulx d0, d0, d1
1744 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
; Scalar fmulx.f32 where the second operand is element 3 extracted from a
; <4 x float>. The expected assembly is the scalar by-element form
; fmulx.s ..., v1[3].
1748 define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
1749 ; CHECK-LABEL: fmulxs_lane:
1751 ; CHECK-NEXT: fmulx.s s0, s0, v1[3]
1753 %b = extractelement <4 x float> %vec, i32 3
1754 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
; Scalar fmulx.f64 where the second operand is element 1 extracted from a
; <2 x double>. The expected assembly is the scalar by-element form
; fmulx.d ..., v1[1].
1758 define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
1759 ; CHECK-LABEL: fmulxd_lane:
1761 ; CHECK-NEXT: fmulx.d d0, d0, v1[1]
1763 %b = extractelement <2 x double> %vec, i32 1
1764 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
1768 declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
1769 declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
; smull on the upper eight bytes of both <16 x i8> arguments, selected directly
; with shuffle indices 8-15. The expected assembly is a single smull2.8h.
1772 define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
1773 ; CHECK-LABEL: smull2_8h_simple:
1775 ; CHECK-NEXT: smull2.8h v0, v0, v1
1777 %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1778 %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1779 %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
; smull on the upper halves of two <16 x i8> vectors, where each upper half is
; obtained by bitcasting to <2 x i64>, selecting element 1, and bitcasting back
; to <8 x i8>. The expected assembly is a single smull2.8h.
1783 define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
1784 ; CHECK-LABEL: foo0:
1786 ; CHECK-NEXT: smull2.8h v0, v0, v1
1788 %tmp = bitcast <16 x i8> %a to <2 x i64>
1789 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1790 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1791 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1792 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1793 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1794 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1795 ret <8 x i16> %vmull.i.i
; Signed widening multiply of the high halves of two <8 x i16> vectors. Each
; high half is element 1 of the <2 x i64> view, bitcast to <4 x i16>. The
; expected assembly is `smull2.4s v0, v0, v1`.
1798 define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
1799 ; CHECK-LABEL: foo1:
1801 ; CHECK-NEXT: smull2.4s v0, v0, v1
1803 %tmp = bitcast <8 x i16> %a to <2 x i64>
1804 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1805 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1806 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1807 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1808 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1809 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1810 ret <4 x i32> %vmull2.i.i
; Signed widening multiply of the high halves of two <4 x i32> vectors. Each
; high half is element 1 of the <2 x i64> view, bitcast to <2 x i32>. The
; expected assembly is `smull2.2d v0, v0, v1`.
1813 define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
1814 ; CHECK-LABEL: foo2:
1816 ; CHECK-NEXT: smull2.2d v0, v0, v1
1818 %tmp = bitcast <4 x i32> %a to <2 x i64>
1819 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1820 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1821 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1822 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1823 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1824 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1825 ret <2 x i64> %vmull2.i.i
; Unsigned widening multiply of the high halves of two <16 x i8> vectors. Each
; high half is element 1 of the <2 x i64> view, bitcast to <8 x i8>. The
; expected assembly is `umull2.8h v0, v0, v1`.
1828 define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
1829 ; CHECK-LABEL: foo3:
1831 ; CHECK-NEXT: umull2.8h v0, v0, v1
1833 %tmp = bitcast <16 x i8> %a to <2 x i64>
1834 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1835 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1836 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1837 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1838 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1839 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1840 ret <8 x i16> %vmull.i.i
; Unsigned widening multiply of the high halves of two <8 x i16> vectors. Each
; high half is element 1 of the <2 x i64> view, bitcast to <4 x i16>. The
; expected assembly is `umull2.4s v0, v0, v1`.
1843 define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
1844 ; CHECK-LABEL: foo4:
1846 ; CHECK-NEXT: umull2.4s v0, v0, v1
1848 %tmp = bitcast <8 x i16> %a to <2 x i64>
1849 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1850 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1851 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1852 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1853 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1854 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1855 ret <4 x i32> %vmull2.i.i
; Unsigned widening multiply of the high halves of two <4 x i32> vectors. Each
; high half is element 1 of the <2 x i64> view, bitcast to <2 x i32>. The
; expected assembly is `umull2.2d v0, v0, v1`.
1858 define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
1859 ; CHECK-LABEL: foo5:
1861 ; CHECK-NEXT: umull2.2d v0, v0, v1
1863 %tmp = bitcast <4 x i32> %a to <2 x i64>
1864 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1865 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1866 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1867 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1868 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1869 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1870 ret <2 x i64> %vmull2.i.i
; Signed widening multiply of the high half of %b (element 1 of its <2 x i64>
; view) by lane 1 of %c splatted across four lanes. The expected assembly is
; the by-element form `smull2.4s v0, v1, v2[1]`. %a is not referenced by the
; body shown here.
1873 define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1874 ; CHECK-LABEL: foo6:
1875 ; CHECK: // %bb.0: // %entry
1876 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1877 ; CHECK-NEXT: smull2.4s v0, v1, v2[1]
1880 %0 = bitcast <8 x i16> %b to <2 x i64>
1881 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1882 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1883 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1884 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1885 ret <4 x i32> %vmull2.i
; Low-half counterpart of foo6: element 0 of the <2 x i64> view of %b is
; multiplied by lane 1 of %c splatted across four lanes. The expected assembly
; is the non-"2" by-element form `smull.4s v0, v1, v2[1]`.
1888 define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1889 ; CHECK-LABEL: foo6a:
1890 ; CHECK: // %bb.0: // %entry
1891 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1892 ; CHECK-NEXT: smull.4s v0, v1, v2[1]
1895 %0 = bitcast <8 x i16> %b to <2 x i64>
1896 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1897 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1898 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1899 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1900 ret <4 x i32> %vmull2.i
; Signed widening multiply of the high half of %b (element 1 of its <2 x i64>
; view) by lane 1 of %c splatted across two lanes. The expected assembly is
; the by-element form `smull2.2d v0, v1, v2[1]`. %a is not referenced by the
; body shown here.
1903 define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1904 ; CHECK-LABEL: foo7:
1905 ; CHECK: // %bb.0: // %entry
1906 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1907 ; CHECK-NEXT: smull2.2d v0, v1, v2[1]
1910 %0 = bitcast <4 x i32> %b to <2 x i64>
1911 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1912 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1913 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1914 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1915 ret <2 x i64> %vmull2.i
; Low-half counterpart of foo7: element 0 of the <2 x i64> view of %b is
; multiplied by lane 1 of %c splatted across two lanes. The expected assembly
; is the non-"2" by-element form `smull.2d v0, v1, v2[1]`.
1918 define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1919 ; CHECK-LABEL: foo7a:
1920 ; CHECK: // %bb.0: // %entry
1921 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1922 ; CHECK-NEXT: smull.2d v0, v1, v2[1]
1925 %0 = bitcast <4 x i32> %b to <2 x i64>
1926 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1927 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1928 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1929 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1930 ret <2 x i64> %vmull2.i
; Unsigned widening multiply of the high half of %b (element 1 of its
; <2 x i64> view) by lane 1 of %c splatted across four lanes. The expected
; assembly is the by-element form `umull2.4s v0, v1, v2[1]`. %a is not
; referenced by the body shown here.
1934 define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1935 ; CHECK-LABEL: foo8:
1936 ; CHECK: // %bb.0: // %entry
1937 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1938 ; CHECK-NEXT: umull2.4s v0, v1, v2[1]
1941 %0 = bitcast <8 x i16> %b to <2 x i64>
1942 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1943 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1944 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1945 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1946 ret <4 x i32> %vmull2.i
; Low-half counterpart of foo8: element 0 of the <2 x i64> view of %b is
; multiplied by lane 1 of %c splatted across four lanes. The expected assembly
; is the non-"2" by-element form `umull.4s v0, v1, v2[1]`.
1949 define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1950 ; CHECK-LABEL: foo8a:
1951 ; CHECK: // %bb.0: // %entry
1952 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1953 ; CHECK-NEXT: umull.4s v0, v1, v2[1]
1956 %0 = bitcast <8 x i16> %b to <2 x i64>
1957 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1958 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1959 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1960 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1961 ret <4 x i32> %vmull2.i
; Unsigned widening multiply of the high half of %b (element 1 of its
; <2 x i64> view) by lane 1 of %c splatted across two lanes. The expected
; assembly is the by-element form `umull2.2d v0, v1, v2[1]`. %a is not
; referenced by the body shown here.
1964 define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1965 ; CHECK-LABEL: foo9:
1966 ; CHECK: // %bb.0: // %entry
1967 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1968 ; CHECK-NEXT: umull2.2d v0, v1, v2[1]
1971 %0 = bitcast <4 x i32> %b to <2 x i64>
1972 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1973 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1974 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1975 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1976 ret <2 x i64> %vmull2.i
; Low-half counterpart of foo9: element 0 of the <2 x i64> view of %b is
; multiplied by lane 1 of %c splatted across two lanes. The expected assembly
; is the non-"2" by-element form `umull.2d v0, v1, v2[1]`.
1979 define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1980 ; CHECK-LABEL: foo9a:
1981 ; CHECK: // %bb.0: // %entry
1982 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1983 ; CHECK-NEXT: umull.2d v0, v1, v2[1]
1986 %0 = bitcast <4 x i32> %b to <2 x i64>
1987 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1988 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1989 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1990 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1991 ret <2 x i64> %vmull2.i
; Signed widening multiply of the high halves of %b and %c (element 1 of each
; <2 x i64> view) followed by an add of the accumulator %a. The multiply and
; add are expected to fuse into `smlal2.8h v0, v1, v2`.
1994 define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
1995 ; CHECK-LABEL: bar0:
1997 ; CHECK-NEXT: smlal2.8h v0, v1, v2
1999 %tmp = bitcast <16 x i8> %b to <2 x i64>
2000 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2001 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
2002 %tmp2 = bitcast <16 x i8> %c to <2 x i64>
2003 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2004 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
2005 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
2006 %add.i = add <8 x i16> %vmull.i.i.i, %a
2007 ret <8 x i16> %add.i
; Signed widening multiply of the high halves of two <8 x i16> vectors
; followed by an add of the accumulator %a. The multiply and add are expected
; to fuse into `smlal2.4s v0, v1, v2`.
2010 define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
2011 ; CHECK-LABEL: bar1:
2013 ; CHECK-NEXT: smlal2.4s v0, v1, v2
2015 %tmp = bitcast <8 x i16> %b to <2 x i64>
2016 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2017 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
2018 %tmp2 = bitcast <8 x i16> %c to <2 x i64>
2019 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2020 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
2021 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
2022 %add.i = add <4 x i32> %vmull2.i.i.i, %a
2023 ret <4 x i32> %add.i
; Signed widening multiply of the high halves of two <4 x i32> vectors
; followed by an add of the accumulator %a. The multiply and add are expected
; to fuse into `smlal2.2d v0, v1, v2`.
2026 define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
2027 ; CHECK-LABEL: bar2:
2029 ; CHECK-NEXT: smlal2.2d v0, v1, v2
2031 %tmp = bitcast <4 x i32> %b to <2 x i64>
2032 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2033 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
2034 %tmp2 = bitcast <4 x i32> %c to <2 x i64>
2035 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2036 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
2037 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2038 %add.i = add <2 x i64> %vmull2.i.i.i, %a
2039 ret <2 x i64> %add.i
; Unsigned widening multiply of the high halves of two <16 x i8> vectors
; followed by an add of the accumulator %a. The multiply and add are expected
; to fuse into `umlal2.8h v0, v1, v2`.
2042 define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
2043 ; CHECK-LABEL: bar3:
2045 ; CHECK-NEXT: umlal2.8h v0, v1, v2
2047 %tmp = bitcast <16 x i8> %b to <2 x i64>
2048 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2049 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
2050 %tmp2 = bitcast <16 x i8> %c to <2 x i64>
2051 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2052 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
2053 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
2054 %add.i = add <8 x i16> %vmull.i.i.i, %a
2055 ret <8 x i16> %add.i
; Unsigned widening multiply of the high halves of two <8 x i16> vectors
; followed by an add of the accumulator %a. The multiply and add are expected
; to fuse into `umlal2.4s v0, v1, v2`.
2058 define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
2059 ; CHECK-LABEL: bar4:
2061 ; CHECK-NEXT: umlal2.4s v0, v1, v2
2063 %tmp = bitcast <8 x i16> %b to <2 x i64>
2064 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2065 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
2066 %tmp2 = bitcast <8 x i16> %c to <2 x i64>
2067 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2068 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
2069 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
2070 %add.i = add <4 x i32> %vmull2.i.i.i, %a
2071 ret <4 x i32> %add.i
; Unsigned widening multiply of the high halves of two <4 x i32> vectors
; followed by an add of the accumulator %a. The multiply and add are expected
; to fuse into `umlal2.2d v0, v1, v2`.
2074 define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
2075 ; CHECK-LABEL: bar5:
2077 ; CHECK-NEXT: umlal2.2d v0, v1, v2
2079 %tmp = bitcast <4 x i32> %b to <2 x i64>
2080 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2081 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
2082 %tmp2 = bitcast <4 x i32> %c to <2 x i64>
2083 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2084 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
2085 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2086 %add.i = add <2 x i64> %vmull2.i.i.i, %a
2087 ret <2 x i64> %add.i
; Lane 3 of %c is splatted to eight lanes. The high half of that splat and the
; high half of %b (element 1 of each <2 x i64> view) feed a signed widening
; multiply, and %a is added to the product. The expected assembly is the
; by-element accumulate `smlal2.4s v0, v1, v2[3]`.
2090 define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
2091 ; CHECK-LABEL: mlal2_1:
2093 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2094 ; CHECK-NEXT: smlal2.4s v0, v1, v2[3]
2096 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
2097 %tmp = bitcast <8 x i16> %b to <2 x i64>
2098 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2099 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2100 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
2101 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2102 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
2103 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
2104 %add = add <4 x i32> %vmull2.i.i, %a
; Lane 1 of %c is splatted to four lanes. The high half of that splat and the
; high half of %b feed a signed widening multiply, and %a is added to the
; product. The expected assembly is the by-element accumulate
; `smlal2.2d v0, v1, v2[1]`.
2108 define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
2109 ; CHECK-LABEL: mlal2_2:
2111 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2112 ; CHECK-NEXT: smlal2.2d v0, v1, v2[1]
2114 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2115 %tmp = bitcast <4 x i32> %b to <2 x i64>
2116 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2117 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2118 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
2119 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2120 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
2121 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2122 %add = add <2 x i64> %vmull2.i.i, %a
; Unsigned variant of the by-element high-half accumulate: lane 2 of %c is
; splatted to eight lanes, the high halves of the splat and of %b are
; multiplied with umull, and %a is added. The expected assembly is
; `umlal2.4s v0, v1, v2[2]`.
2126 define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
2127 ; CHECK-LABEL: mlal2_4:
2129 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2130 ; CHECK-NEXT: umlal2.4s v0, v1, v2[2]
2132 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2133 %tmp = bitcast <8 x i16> %b to <2 x i64>
2134 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2135 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2136 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
2137 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2138 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
2139 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
2140 %add = add <4 x i32> %vmull2.i.i, %a
; Unsigned variant of the by-element high-half accumulate: lane 0 of %c is
; splatted to four lanes using a zeroinitializer shuffle mask, the high halves
; of the splat and of %b are multiplied with umull, and %a is added. The
; expected assembly is `umlal2.2d v0, v1, v2[0]`.
2144 define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
2145 ; CHECK-LABEL: mlal2_5:
2147 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2148 ; CHECK-NEXT: umlal2.2d v0, v1, v2[0]
2150 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
2151 %tmp = bitcast <4 x i32> %b to <2 x i64>
2152 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2153 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2154 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
2155 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2156 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
2157 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2158 %add = add <2 x i64> %vmull2.i.i, %a
; Multiplies %x by the scalar %y, which is splatted by inserting it into both
; lanes of a <2 x double>. The splat is expected to fold into the by-element
; form `fmul.2d v0, v0, v1[0]`.
2163 define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
2164 ; CHECK-LABEL: vmulq_n_f64:
2165 ; CHECK: // %bb.0: // %entry
2166 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2167 ; CHECK-NEXT: fmul.2d v0, v0, v1[0]
2170 %vecinit.i = insertelement <2 x double> undef, double %y, i32 0
2171 %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
2172 %mul.i = fmul <2 x double> %vecinit1.i, %x
2173 ret <2 x double> %mul.i
; Multiplies %x by the scalar %y, which is splatted by inserting it into all
; four lanes of a <4 x float>. The splat is expected to fold into the
; by-element form `fmul.4s v0, v0, v1[0]`.
2176 define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
2177 ; CHECK-LABEL: vmulq_n_f32:
2178 ; CHECK: // %bb.0: // %entry
2179 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
2180 ; CHECK-NEXT: fmul.4s v0, v0, v1[0]
2183 %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
2184 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
2185 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
2186 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
2187 %mul.i = fmul <4 x float> %vecinit3.i, %x
2188 ret <4 x float> %mul.i
; 64-bit vector variant: multiplies %x by the scalar %y splatted into both
; lanes of a <2 x float>. The splat is expected to fold into the by-element
; form `fmul.2s v0, v0, v1[0]`.
2191 define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
2192 ; CHECK-LABEL: vmul_n_f32:
2193 ; CHECK: // %bb.0: // %entry
2194 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
2195 ; CHECK-NEXT: fmul.2s v0, v0, v1[0]
2198 %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
2199 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
2200 %mul.i = fmul <2 x float> %vecinit1.i, %x
2201 ret <2 x float> %mul.i
; Multiply-accumulate with a lane taken from a 128-bit vector: lane 6 of the
; <8 x i16> %c is splatted to four lanes, multiplied by %b and added to %a.
; The expected assembly is `mla.4h v0, v1, v2[6]`.
2204 define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
2205 ; CHECK-LABEL: vmla_laneq_s16_test:
2206 ; CHECK: // %bb.0: // %entry
2207 ; CHECK-NEXT: mla.4h v0, v1, v2[6]
2210 %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
2211 %mul = mul <4 x i16> %shuffle, %b
2212 %add = add <4 x i16> %mul, %a
; Multiply-accumulate with a lane taken from a 128-bit vector: lane 3 of the
; <4 x i32> %c is splatted to two lanes, multiplied by %b and added to %a.
; The expected assembly is `mla.2s v0, v1, v2[3]`.
2216 define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
2217 ; CHECK-LABEL: vmla_laneq_s32_test:
2218 ; CHECK: // %bb.0: // %entry
2219 ; CHECK-NEXT: mla.2s v0, v1, v2[3]
2222 %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
2223 %mul = mul <2 x i32> %shuffle, %b
2224 %add = add <2 x i32> %mul, %a
; The splat is built in two steps: the first shuffle takes lanes 4..7 of %c,
; the second splats lane 1 of that result to eight lanes, which is lane 5 of
; the original %c. The product with %b is added to %a; the expected assembly
; is `mla.8h v0, v1, v2[5]`.
2228 define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
2229 ; CHECK-LABEL: not_really_vmlaq_laneq_s16_test:
2230 ; CHECK: // %bb.0: // %entry
2231 ; CHECK-NEXT: mla.8h v0, v1, v2[5]
2234 %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2235 %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2236 %mul = mul <8 x i16> %shuffle2, %b
2237 %add = add <8 x i16> %mul, %a
; The splat is built in two steps: the first shuffle takes lanes 2..3 of %c,
; the second splats lane 1 of that result to four lanes, which is lane 3 of
; the original %c. The product with %b is added to %a; the expected assembly
; is `mla.4s v0, v1, v2[3]`.
2241 define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
2242 ; CHECK-LABEL: not_really_vmlaq_laneq_s32_test:
2243 ; CHECK: // %bb.0: // %entry
2244 ; CHECK-NEXT: mla.4s v0, v1, v2[3]
2247 %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2248 %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2249 %mul = mul <4 x i32> %shuffle2, %b
2250 %add = add <4 x i32> %mul, %a
; Signed widening multiply of %a by lane 6 of the 128-bit vector %b, splatted
; to four lanes. The expected assembly is the by-element form
; `smull.4s v0, v0, v1[6]`.
; NOTE(review): attribute group #2 on the call is defined outside this part of
; the file.
2254 define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
2255 ; CHECK-LABEL: vmull_laneq_s16_test:
2256 ; CHECK: // %bb.0: // %entry
2257 ; CHECK-NEXT: smull.4s v0, v0, v1[6]
2260 %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
2261 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
2262 ret <4 x i32> %vmull2.i
; Signed widening multiply of %a by lane 2 of the 128-bit vector %b, splatted
; to two lanes. The expected assembly is the by-element form
; `smull.2d v0, v0, v1[2]`.
; NOTE(review): attribute group #2 on the call is defined outside this part of
; the file.
2265 define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
2266 ; CHECK-LABEL: vmull_laneq_s32_test:
2267 ; CHECK: // %bb.0: // %entry
2268 ; CHECK-NEXT: smull.2d v0, v0, v1[2]
2271 %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
2272 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
2273 ret <2 x i64> %vmull2.i
; Unsigned widening multiply of %a by lane 6 of the 128-bit vector %b,
; splatted to four lanes. The expected assembly is the by-element form
; `umull.4s v0, v0, v1[6]`.
; NOTE(review): attribute group #2 on the call is defined outside this part of
; the file.
2275 define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
2276 ; CHECK-LABEL: vmull_laneq_u16_test:
2277 ; CHECK: // %bb.0: // %entry
2278 ; CHECK-NEXT: umull.4s v0, v0, v1[6]
2281 %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
2282 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
2283 ret <4 x i32> %vmull2.i
; Unsigned widening multiply of %a by lane 2 of the 128-bit vector %b,
; splatted to two lanes. The expected assembly is the by-element form
; `umull.2d v0, v0, v1[2]`.
; NOTE(review): attribute group #2 on the call is defined outside this part of
; the file.
2286 define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
2287 ; CHECK-LABEL: vmull_laneq_u32_test:
2288 ; CHECK: // %bb.0: // %entry
2289 ; CHECK-NEXT: umull.2d v0, v0, v1[2]
2292 %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
2293 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
2294 ret <2 x i64> %vmull2.i
; Signed widening multiply of the low half of %b (element 0 of its <2 x i64>
; view) by the scalar %d, truncated to i16 and splatted with four
; insertelements. The expected assembly is a 64-bit `dup.4h v0, w0` followed
; by `smull.4s v0, v1, v0`. %a and %c are not referenced by the body shown
; here.
2297 define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
2298 ; CHECK-LABEL: vmull_low_n_s16_test:
2299 ; CHECK: // %bb.0: // %entry
2300 ; CHECK-NEXT: dup.4h v0, w0
2301 ; CHECK-NEXT: smull.4s v0, v1, v0
2304 %conv = trunc i32 %d to i16
2305 %0 = bitcast <8 x i16> %b to <2 x i64>
2306 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
2307 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2308 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
2309 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
2310 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
2311 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
2312 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
2313 ret <4 x i32> %vmull2.i.i
; Signed widening multiply of the high half of %b (element 1 of its <2 x i64>
; view) by the scalar %d, truncated to i16 and splatted with four
; insertelements. The expected assembly widens the splat to a 128-bit
; `dup.8h v0, w0` and then uses `smull2.4s v0, v1, v0`.
2316 define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
2317 ; CHECK-LABEL: vmull_high_n_s16_test:
2318 ; CHECK: // %bb.0: // %entry
2319 ; CHECK-NEXT: dup.8h v0, w0
2320 ; CHECK-NEXT: smull2.4s v0, v1, v0
2323 %conv = trunc i32 %d to i16
2324 %0 = bitcast <8 x i16> %b to <2 x i64>
2325 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2326 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2327 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
2328 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
2329 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
2330 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
2331 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
2332 ret <4 x i32> %vmull2.i.i
; Signed widening multiply of the high half of %b (element 1 of its <2 x i64>
; view) by the scalar %d splatted into both lanes of a <2 x i32>. The expected
; assembly is `dup.4s v0, w0` followed by `smull2.2d v0, v1, v0`.
2335 define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
2336 ; CHECK-LABEL: vmull_high_n_s32_test:
2337 ; CHECK: // %bb.0: // %entry
2338 ; CHECK-NEXT: dup.4s v0, w0
2339 ; CHECK-NEXT: smull2.2d v0, v1, v0
2342 %0 = bitcast <4 x i32> %b to <2 x i64>
2343 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2344 %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2345 %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
2346 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
2347 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
2348 ret <2 x i64> %vmull2.i.i
; Unsigned widening multiply of the high half of %b (element 1 of its
; <2 x i64> view) by the scalar %d, truncated to i16 and splatted with four
; insertelements. The expected assembly is `dup.8h v0, w0` followed by
; `umull2.4s v0, v1, v0`.
2351 define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
2352 ; CHECK-LABEL: vmull_high_n_u16_test:
2353 ; CHECK: // %bb.0: // %entry
2354 ; CHECK-NEXT: dup.8h v0, w0
2355 ; CHECK-NEXT: umull2.4s v0, v1, v0
2358 %conv = trunc i32 %d to i16
2359 %0 = bitcast <8 x i16> %b to <2 x i64>
2360 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2361 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2362 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
2363 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
2364 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
2365 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
2366 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
2367 ret <4 x i32> %vmull2.i.i
; Unsigned widening multiply of the high half of %b (element 1 of its
; <2 x i64> view) by the scalar %d splatted into both lanes of a <2 x i32>.
; The expected assembly is `dup.4s v0, w0` followed by `umull2.2d v0, v1, v0`.
2370 define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
2371 ; CHECK-LABEL: vmull_high_n_u32_test:
2372 ; CHECK: // %bb.0: // %entry
2373 ; CHECK-NEXT: dup.4s v0, w0
2374 ; CHECK-NEXT: umull2.2d v0, v1, v0
2377 %0 = bitcast <4 x i32> %b to <2 x i64>
2378 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2379 %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2380 %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
2381 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
2382 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
2383 ret <2 x i64> %vmull2.i.i
; Lane 1 of %b is extracted and a splat is rebuilt from it with four
; insertelements before the multiply. The extract/insert chain is expected to
; fold into the by-element form `mul.4s v0, v0, v1[1]`.
2386 define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
2387 ; CHECK-LABEL: vmul_built_dup_test:
2389 ; CHECK-NEXT: mul.4s v0, v0, v1[1]
2391 %vget_lane = extractelement <4 x i32> %b, i32 1
2392 %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
2393 %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
2394 %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
2395 %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
2396 %prod = mul <4 x i32> %a, %vecinit3.i
; Same extract-and-rebuild splat pattern on 64-bit vectors: lane 3 of the
; <4 x i16> %b is extracted, re-inserted into all four lanes, and multiplied
; by %a. The expected assembly is `mul.4h v0, v0, v1[3]`.
2400 define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
2401 ; CHECK-LABEL: vmul_built_dup_fromsmall_test:
2403 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2404 ; CHECK-NEXT: mul.4h v0, v0, v1[3]
2406 %vget_lane = extractelement <4 x i16> %b, i32 3
2407 %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
2408 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
2409 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
2410 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
2411 %prod = mul <4 x i16> %a, %vecinit3.i
; Lane 0 of the 64-bit <4 x i16> %b is extracted and inserted into all eight
; lanes of a 128-bit <8 x i16> before the multiply by %a. The expected
; assembly is the by-element form `mul.8h v0, v0, v1[0]`.
2415 define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
2416 ; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
2418 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2419 ; CHECK-NEXT: mul.8h v0, v0, v1[0]
2421 %vget_lane = extractelement <4 x i16> %b, i32 0
2422 %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
2423 %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
2424 %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
2425 %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
2426 %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
2427 %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
2428 %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
2429 %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
2430 %prod = mul <8 x i16> %a, %vecinit7.i
; sqdmull of the high halves (lanes 2..3) of both <4 x i32> operands. The two
; extracts are expected to fold into `sqdmull2.2d v0, v0, v1`.
2434 define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
2435 ; CHECK-LABEL: mull_from_two_extracts:
2437 ; CHECK-NEXT: sqdmull2.2d v0, v0, v1
2439 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2440 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2442 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
; sqdmull of the high halves (lanes 2..3) of %lhs and %rhs, with the product
; then combined with %accum through the sqadd intrinsic. The pair is expected
; to fuse into `sqdmlal2.2d v0, v1, v2`.
2446 define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2447 ; CHECK-LABEL: mlal_from_two_extracts:
2449 ; CHECK-NEXT: sqdmlal2.2d v0, v1, v2
2451 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2452 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2454 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2455 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
; sqdmull of the low half of %lhs by the scalar %rhs splatted into a
; <2 x i32>. The expected assembly is a 64-bit `dup.2s v1, w0` followed by the
; non-"2" form `sqdmull.2d v0, v0, v1`.
; Note: despite its name, %lhs.high holds lanes 0..1 (the low half) here.
2459 define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
2460 ; CHECK-LABEL: mull_from_extract_dup_low:
2462 ; CHECK-NEXT: dup.2s v1, w0
2463 ; CHECK-NEXT: sqdmull.2d v0, v0, v1
2465 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
2466 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
2468 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2470 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
; sqdmull of the high half (lanes 2..3) of %lhs by the scalar %rhs splatted
; into a <2 x i32>. The expected assembly widens the splat to a 128-bit
; `dup.4s v1, w0` and then uses `sqdmull2.2d v0, v0, v1`.
2474 define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
2475 ; CHECK-LABEL: mull_from_extract_dup_high:
2477 ; CHECK-NEXT: dup.4s v1, w0
2478 ; CHECK-NEXT: sqdmull2.2d v0, v0, v1
2480 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
2481 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
2483 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2485 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
; pmull of the low half of %lhs by the scalar %rhs, which is inserted into
; lane 0 and then splatted with an all-zero shuffle mask. The expected
; assembly is `dup.8b v1, w0` followed by the non-"2" form
; `pmull.8h v0, v0, v1`.
; Note: despite its name, %lhs.high holds lanes 0..7 (the low half) here.
2489 define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
2490 ; CHECK-LABEL: pmull_from_extract_dup_low:
2492 ; CHECK-NEXT: dup.8b v1, w0
2493 ; CHECK-NEXT: pmull.8h v0, v0, v1
2495 %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
2496 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2498 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2500 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
; pmull of the high half (lanes 8..15) of %lhs by the scalar %rhs, which is
; inserted into lane 0 and then splatted with an all-zero shuffle mask. The
; expected assembly widens the splat to `dup.16b v1, w0` and then uses
; `pmull2.8h v0, v0, v1`.
2504 define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
2505 ; CHECK-LABEL: pmull_from_extract_dup_high:
2507 ; CHECK-NEXT: dup.16b v1, w0
2508 ; CHECK-NEXT: pmull2.8h v0, v0, v1
2510 %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
2511 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2513 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2515 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
; pmull of the low half of %lhs by lane 0 of the vector %rhs splatted to eight
; lanes. The expected assembly keeps an explicit `dup.8b v1, v1[0]` before the
; non-"2" form `pmull.8h v0, v0, v1`.
; Note: despite its name, %lhs.high holds lanes 0..7 (the low half) here.
2519 define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
2520 ; CHECK-LABEL: pmull_from_extract_duplane_low:
2522 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2523 ; CHECK-NEXT: dup.8b v1, v1[0]
2524 ; CHECK-NEXT: pmull.8h v0, v0, v1
2526 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2527 %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2529 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
; Polynomial multiply of the high half of %lhs by a lane-0 splat of a vector.
; Lanes 8-15 of %lhs are multiplied by lane 0 of the 8-lane %rhs broadcast to
; every lane.
; Expected assembly: dup.16b from v1[0] followed by the high-half form
; pmull2.8h (the expected output also contains a "// kill:" annotation on $d1).
2533 define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
2534 ; CHECK-LABEL: pmull_from_extract_duplane_high:
2536 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2537 ; CHECK-NEXT: dup.16b v1, v1[0]
2538 ; CHECK-NEXT: pmull2.8h v0, v0, v1
2540 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2541 %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2543 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
; sqdmull of the low half of %lhs by a lane-0 splat of %rhs.
; Lanes 0-1 of %lhs are multiplied by lane 0 of %rhs duplicated into both lanes.
; Expected assembly: a single by-element sqdmull.2d using v1[0]; no separate
; dup or extract instruction appears in the expected output.
2547 define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
2548 ; CHECK-LABEL: sqdmull_from_extract_duplane_low:
2550 ; CHECK-NEXT: sqdmull.2d v0, v0, v1[0]
; Despite its name, %lhs.high holds lanes 0-1 (the low half) in this test.
2552 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2553 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2555 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
; sqdmull of the high half of %lhs by a lane-0 splat of %rhs.
; Lanes 2-3 of %lhs are multiplied by lane 0 of %rhs duplicated into both lanes.
; Expected assembly: a single by-element, high-half sqdmull2.2d using v1[0].
2559 define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
2560 ; CHECK-LABEL: sqdmull_from_extract_duplane_high:
2562 ; CHECK-NEXT: sqdmull2.2d v0, v0, v1[0]
2564 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2565 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2567 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
; sqdmull (low half of %lhs times lane-0 splat of %rhs) accumulated into
; %accum through the sqadd.v2i64 intrinsic.
; Expected assembly: the multiply and the saturating add combine into one
; by-element sqdmlal.2d using v2[0].
2571 define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2572 ; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
2574 ; CHECK-NEXT: sqdmlal.2d v0, v1, v2[0]
; Despite its name, %lhs.high holds lanes 0-1 (the low half) in this test.
2576 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2577 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2579 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2580 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
; sqdmull (high half of %lhs, lanes 2-3, times lane-0 splat of %rhs)
; accumulated into %accum through the sqadd.v2i64 intrinsic.
; Expected assembly: one by-element, high-half sqdmlal2.2d using v2[0].
2584 define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2585 ; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
2587 ; CHECK-NEXT: sqdmlal2.2d v0, v1, v2[0]
2589 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2590 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2592 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2593 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
; umull (low half of %lhs times lane-0 splat of %rhs) followed by a plain
; vector add with %accum.
; Expected assembly: the multiply and the add combine into one by-element
; umlal.2d using v2[0].
2597 define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2598 ; CHECK-LABEL: umlal_from_extract_duplane_low:
2600 ; CHECK-NEXT: umlal.2d v0, v1, v2[0]
; Despite its name, %lhs.high holds lanes 0-1 (the low half) in this test.
2602 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2603 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2605 %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2606 %sum = add <2 x i64> %accum, %res
; umull (high half of %lhs, lanes 2-3, times lane-0 splat of %rhs) followed by
; a plain vector add with %accum.
; Expected assembly: one by-element, high-half umlal2.2d using v2[0].
2610 define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2611 ; CHECK-LABEL: umlal_from_extract_duplane_high:
2613 ; CHECK-NEXT: umlal2.2d v0, v1, v2[0]
2615 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2616 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2618 %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2619 %sum = add <2 x i64> %accum, %res
; Scalar fused multiply-add whose multiplier is lane 3 of a <4 x float>.
; Computes llvm.fma.f32(%lhs, %rvec[3], %accum).
; Expected assembly: one by-element scalar fmla.s reading v2[3] directly.
2623 define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
2624 ; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
2626 ; CHECK-NEXT: fmla.s s0, s1, v2[3]
2628 %rhs = extractelement <4 x float> %rvec, i32 3
2629 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
; Scalar fused multiply-add whose multiplier is lane 1 of a <2 x float>.
; Computes llvm.fma.f32(%lhs, %rvec[1], %accum).
; Expected assembly: one by-element scalar fmla.s reading v2[1]; the expected
; output also contains a "// kill:" annotation on $d2, the 64-bit input vector.
2633 define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
2634 ; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
2636 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2637 ; CHECK-NEXT: fmla.s s0, s1, v2[1]
2639 %rhs = extractelement <2 x float> %rvec, i32 1
2640 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
; Scalar fused multiply-subtract whose multiplier is lane 3 of a <4 x float>.
; The extracted lane is negated (written as fsub from -0.0) before being passed
; to llvm.fma.f32 together with %lhs and %accum.
; Expected assembly: one by-element scalar fmls.s reading v2[3]; no separate
; negate instruction appears in the expected output.
2644 define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
2645 ; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
2647 ; CHECK-NEXT: fmls.s s0, s1, v2[3]
2649 %rhs.scal = extractelement <4 x float> %rvec, i32 3
2650 %rhs = fsub float -0.0, %rhs.scal
2651 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
; Scalar fused multiply-subtract whose multiplier is lane 1 of a <2 x float>.
; The extracted lane is negated (written as fsub from -0.0) before being passed
; to llvm.fma.f32 together with %lhs and %accum.
; Expected assembly: one by-element scalar fmls.s reading v2[1]; the expected
; output also contains a "// kill:" annotation on $d2, the 64-bit input vector.
2655 define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
2656 ; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
2658 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2659 ; CHECK-NEXT: fmls.s s0, s1, v2[1]
2661 %rhs.scal = extractelement <2 x float> %rvec, i32 1
2662 %rhs = fsub float -0.0, %rhs.scal
2663 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
2667 declare float @llvm.fma.f32(float, float, float)
; Double-precision scalar fused multiply-add whose multiplier is lane 1 of a
; <2 x double>. Computes llvm.fma.f64(%lhs, %rvec[1], %accum).
; Expected assembly: one by-element scalar fmla.d reading v2[1].
2669 define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
2670 ; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
2672 ; CHECK-NEXT: fmla.d d0, d1, v2[1]
2674 %rhs = extractelement <2 x double> %rvec, i32 1
2675 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
; Double-precision scalar fused multiply-subtract whose multiplier is lane 1 of
; a <2 x double>. The extracted lane is negated (fsub from -0.0) before the
; llvm.fma.f64 call.
; Expected assembly: one by-element scalar fmls.d reading v2[1].
2679 define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
2680 ; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
2682 ; CHECK-NEXT: fmls.d d0, d1, v2[1]
2684 %rhs.scal = extractelement <2 x double> %rvec, i32 1
2685 %rhs = fsub double -0.0, %rhs.scal
2686 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
2690 declare double @llvm.fma.f64(double, double, double)
; Vector fmls where the negation is applied to the whole <4 x float> source
; BEFORE its lane 3 is splatted, rather than to the splat itself.
; The 2-lane splat of the negated lane 3 is passed to llvm.fma.v2f32 with %lhs
; and %accum.
; Expected assembly: one by-element fmls.2s reading v2[3]; no separate negate
; or dup instruction appears in the expected output.
2692 define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
2693 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
2695 ; CHECK-NEXT: fmls.2s v0, v1, v2[3]
2697 %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
2698 %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
2699 %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
2700 ret <2 x float> %res
; Variant of the fneg-before-splat fmls test in which the negated source is a
; 64-bit <2 x float> and lane 1 is splatted into a <2 x float>.
; Expected assembly: one by-element fmls.2s reading v2[1]; the expected output
; also contains a "// kill:" annotation on $d2, the 64-bit input vector.
2703 define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
2704 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
2706 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2707 ; CHECK-NEXT: fmls.2s v0, v1, v2[1]
2709 %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
2710 %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
2711 %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
2712 ret <2 x float> %res
; Four-lane version of the fneg-before-splat fmls test: %rhs is negated as a
; whole <4 x float>, lane 3 of the result is splatted to all four lanes, and
; the splat is passed to llvm.fma.v4f32 with %lhs and %accum.
; Expected assembly: one by-element fmls.4s reading v2[3].
2715 define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
2716 ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
2718 ; CHECK-NEXT: fmls.4s v0, v1, v2[3]
2720 %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
2721 %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
2722 %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
2723 ret <4 x float> %res
; Variant of the fneg-before-splat fmls test in which the negated source is a
; 64-bit <2 x float> whose lane 1 is widened by the shuffle into a four-lane
; splat for llvm.fma.v4f32.
; Expected assembly: one by-element fmls.4s reading v2[1]; the expected output
; also contains a "// kill:" annotation on $d2, the 64-bit input vector.
2726 define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
2727 ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
2729 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2730 ; CHECK-NEXT: fmls.4s v0, v1, v2[1]
2732 %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
2733 %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2734 %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
2735 ret <4 x float> %res
; Double-precision version of the fneg-before-splat fmls test: %rhs is negated
; as a whole <2 x double>, lane 1 of the result is splatted to both lanes, and
; the splat is passed to llvm.fma.v2f64 with %lhs and %accum.
; Expected assembly: one by-element fmls.2d reading v2[1].
2738 define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
2739 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
2741 ; CHECK-NEXT: fmls.2d v0, v1, v2[1]
2743 %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
2744 %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
2745 %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
2746 ret <2 x double> %res
; fmul on single-element <1 x double> vectors.
; Expected assembly: the scalar form "fmul d0, d0, d1".
2749 define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
2750 ; CHECK-LABEL: test_fmul_v1f64:
2752 ; CHECK-NEXT: fmul d0, d0, d1
2754 %prod = fmul <1 x double> %L, %R
2755 ret <1 x double> %prod
; fdiv on single-element <1 x double> vectors.
; Expected assembly: the scalar form "fdiv d0, d0, d1".
; (The result value is named %prod although it holds a quotient.)
2758 define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
2759 ; CHECK-LABEL: test_fdiv_v1f64:
2761 ; CHECK-NEXT: fdiv d0, d0, d1
2763 %prod = fdiv <1 x double> %L, %R
2764 ret <1 x double> %prod
; Scalar (i16 x i16 -> i32) sqdmlal built from the vector sqdmull intrinsic.
; %A and %B are each placed in lane 0 of an otherwise-undef <4 x i16>, the
; vector sqdmull.v4i32 is applied, lane 0 of the product is extracted, and that
; value is accumulated into %C with the scalar sqadd.i32 intrinsic.
; Expected assembly: the three integer arguments are moved into s0/s1/s2 with
; fmov, a single sqdmlal.h on lane 0 does the work, and the result is moved
; back to w0.
2767 define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
2768 ; CHECK-LABEL: sqdmlal_s:
2770 ; CHECK-NEXT: fmov s0, w0
2771 ; CHECK-NEXT: fmov s1, w1
2772 ; CHECK-NEXT: fmov s2, w2
2773 ; CHECK-NEXT: sqdmlal.h s2, h0, v1[0]
2774 ; CHECK-NEXT: fmov w0, s2
2776 %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
2777 %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
2778 %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
2779 %tmp4 = extractelement <4 x i32> %tmp3, i64 0
2780 %tmp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %tmp4)
; Scalar (i32 x i32 -> i64) sqdmlal built from the scalar sqdmulls intrinsic.
; The product of %A and %B from sqdmulls.scalar is accumulated into %C with
; the scalar sqadd.i64 intrinsic.
; Expected assembly: the arguments are moved into d0/s1/s2 with fmov, a single
; scalar "sqdmlal d0, s1, s2" does the work, and the result is moved back to x0.
2784 define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
2785 ; CHECK-LABEL: sqdmlal_d:
2787 ; CHECK-NEXT: fmov d0, x2
2788 ; CHECK-NEXT: fmov s1, w0
2789 ; CHECK-NEXT: fmov s2, w1
2790 ; CHECK-NEXT: sqdmlal d0, s1, s2
2791 ; CHECK-NEXT: fmov x0, d0
2793 %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
2794 %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
; Scalar (i16 x i16 -> i32) sqdmlsl built from the vector sqdmull intrinsic.
; %A and %B are each placed in lane 0 of an otherwise-undef <4 x i16>, the
; vector sqdmull.v4i32 is applied, lane 0 of the product is extracted, and that
; value is subtracted from %C with the scalar sqsub.i32 intrinsic.
; Expected assembly: the three integer arguments are moved into s0/s1/s2 with
; fmov, a single sqdmlsl.h on lane 0 does the work, and the result is moved
; back to w0.
2798 define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
2799 ; CHECK-LABEL: sqdmlsl_s:
2801 ; CHECK-NEXT: fmov s0, w0
2802 ; CHECK-NEXT: fmov s1, w1
2803 ; CHECK-NEXT: fmov s2, w2
2804 ; CHECK-NEXT: sqdmlsl.h s2, h0, v1[0]
2805 ; CHECK-NEXT: fmov w0, s2
2807 %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
2808 %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
2809 %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
2810 %tmp4 = extractelement <4 x i32> %tmp3, i64 0
2811 %tmp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %tmp4)
; Scalar (i32 x i32 -> i64) sqdmlsl built from the scalar sqdmulls intrinsic.
; The product of %A and %B from sqdmulls.scalar is subtracted from %C with the
; scalar sqsub.i64 intrinsic.
; Expected assembly: the arguments are moved into d0/s1/s2 with fmov, a single
; scalar "sqdmlsl d0, s1, s2" does the work, and the result is moved back to x0.
2815 define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
2816 ; CHECK-LABEL: sqdmlsl_d:
2818 ; CHECK-NEXT: fmov d0, x2
2819 ; CHECK-NEXT: fmov s1, w0
2820 ; CHECK-NEXT: fmov s2, w1
2821 ; CHECK-NEXT: sqdmlsl d0, s1, s2
2822 ; CHECK-NEXT: fmov x0, d0
2824 %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
2825 %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
; 64-bit polynomial multiply of two i64 scalars through the pmull64 intrinsic,
; producing a 128-bit result typed as <16 x i8>.
; Expected assembly: both integer arguments are moved into d registers with
; fmov (%r from x1 into d0, %l from x0 into d1) and multiplied with pmull.1q.
2829 define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
2830 ; CHECK-LABEL: test_pmull_64:
2832 ; CHECK-NEXT: fmov d0, x1
2833 ; CHECK-NEXT: fmov d1, x0
2834 ; CHECK-NEXT: pmull.1q v0, v1, v0
2836 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
; 64-bit polynomial multiply of the high (lane 1) elements of two <2 x i64>
; vectors through the pmull64 intrinsic.
; Expected assembly: a single high-half pmull2.1q on the input registers, with
; no separate lane extraction in the expected output.
2840 define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
2841 ; CHECK-LABEL: test_pmull_high_64:
2843 ; CHECK-NEXT: pmull2.1q v0, v0, v1
2845 %l_hi = extractelement <2 x i64> %l, i32 1
2846 %r_hi = extractelement <2 x i64> %r, i32 1
2847 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
2851 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
2853 define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
2854 ; CHECK-LABEL: test_mul_v1i64:
2856 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2857 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
2858 ; CHECK-NEXT: fmov x8, d1
2859 ; CHECK-NEXT: fmov x9, d0
2860 ; CHECK-NEXT: mul x8, x9, x8
2861 ; CHECK-NEXT: fmov d0, x8
2863 %prod = mul <1 x i64> %lhs, %rhs