; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; smull: signed widening vector multiply (D-reg sources, Q-reg result).
define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: smull8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: smull4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: smull2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; umull: unsigned widening vector multiply.
define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: umull8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: umull4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: umull2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; sqdmull: signed saturating doubling widening multiply; the *2 variants
; take the high halves of Q-reg sources (folded into offset loads).
define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: sqdmull4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqdmull.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: sqdmull2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqdmull.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: sqdmull2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    sqdmull.4s v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: sqdmull2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    sqdmull.2d v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
; pmull: polynomial (carry-less) widening multiply.
define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: pmull8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    pmull.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
; sqdmulh: signed saturating doubling multiply, returning the high half.
define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: sqdmulh_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqdmulh.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: sqdmulh_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sqdmulh.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: sqdmulh_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqdmulh.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: sqdmulh_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sqdmulh.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; Scalar form: i32 operands are moved into S registers for the instruction.
define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind {
; CHECK-LABEL: sqdmulh_1s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    ldr w9, [x1]
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    fmov s1, w9
; CHECK-NEXT:    sqdmulh s0, s0, s1
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %tmp1 = load i32, i32* %A
  %tmp2 = load i32, i32* %B
  %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
  ret i32 %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
; sqrdmulh: signed saturating rounding doubling multiply, high half.
define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: sqrdmulh_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqrdmulh.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: sqrdmulh_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sqrdmulh.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: sqrdmulh_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqrdmulh.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: sqrdmulh_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sqrdmulh.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; Scalar form: i32 operands are moved into S registers for the instruction.
define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind {
; CHECK-LABEL: sqrdmulh_1s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    ldr w9, [x1]
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    fmov s1, w9
; CHECK-NEXT:    sqrdmulh s0, s0, s1
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %tmp1 = load i32, i32* %A
  %tmp2 = load i32, i32* %B
  %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
  ret i32 %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
; fmulx: floating-point multiply extended.
define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: fmulx_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    fmulx.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: fmulx_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    fmulx.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
; CHECK-LABEL: fmulx_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    fmulx.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
; smlal/smlsl: widening multiply-accumulate formed from smull + add/sub.
define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: smlal4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    smlal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: smlal2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    smlal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: smlsl4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    smlsl.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = sub <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: smlsl2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    smlsl.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = sub <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
; sqdmlal: saturating doubling widening multiply-accumulate
; (sqdmull + saturating add fused into one instruction).
define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: sqdmlal4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sqdmlal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: sqdmlal2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sqdmlal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: sqdmlal2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sqdmlal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: sqdmlal2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sqdmlal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}
; sqdmlsl: saturating doubling widening multiply-subtract
; (sqdmull + saturating sub fused into one instruction).
define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: sqdmlsl4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sqdmlsl.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: sqdmlsl2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sqdmlsl.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: sqdmlsl2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sqdmlsl.4s v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: sqdmlsl2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sqdmlsl.2d v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}
; umlal/umlsl: unsigned widening multiply-accumulate from umull + add/sub.
define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: umlal4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    umlal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: umlal2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    umlal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: umlsl4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    umlsl.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = sub <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: umlsl2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    umlsl.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = sub <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}
; fmla: llvm.fma with the accumulator in the destination register.
define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
; CHECK-LABEL: fmla_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    fmla.2s v0, v2, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = load <2 x float>, <2 x float>* %C
  %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
  ret <2 x float> %tmp4
}

define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; CHECK-LABEL: fmla_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    fmla.4s v0, v2, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = load <4 x float>, <4 x float>* %C
  %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
  ret <4 x float> %tmp4
}

define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
; CHECK-LABEL: fmla_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    fmla.2d v0, v2, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = load <2 x double>, <2 x double>* %C
  %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
  ret <2 x double> %tmp4
}

declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; fmls: fma with one operand negated (fsub from -0.0) folds to fmls.
define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
; CHECK-LABEL: fmls_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    fmls.2s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = load <2 x float>, <2 x float>* %C
  %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
  %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3)
  ret <2 x float> %tmp5
}

define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; CHECK-LABEL: fmls_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    fmls.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = load <4 x float>, <4 x float>* %C
  %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
  %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3)
  ret <4 x float> %tmp5
}

define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
; CHECK-LABEL: fmls_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    fmls.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = load <2 x double>, <2 x double>* %C
  %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
  %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3)
  ret <2 x double> %tmp5
}
; Same fmls fold, with the negated operand in the first fma position.
define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
; CHECK-LABEL: fmls_commuted_neg_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr d0, [x2]
; CHECK-NEXT:    fmls.2s v0, v2, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = load <2 x float>, <2 x float>* %C
  %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
  %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3)
  ret <2 x float> %tmp5
}

define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; CHECK-LABEL: fmls_commuted_neg_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    fmls.4s v0, v2, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = load <4 x float>, <4 x float>* %C
  %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
  %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3)
  ret <4 x float> %tmp5
}

define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
; CHECK-LABEL: fmls_commuted_neg_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    fmls.2d v0, v2, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = load <2 x double>, <2 x double>* %C
  %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
  %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3)
  ret <2 x double> %tmp5
}
; fmls with a splat (lane 0) operand selects the indexed form.
define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
; CHECK-LABEL: fmls_indexed_2s:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    fmls.2s v0, v2, v1[0]
; CHECK-NEXT:    ret
entry:
  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
  %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
  ret <2 x float> %fmls1
}

define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
; CHECK-LABEL: fmls_indexed_4s:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fmls.4s v0, v2, v1[0]
; CHECK-NEXT:    ret
entry:
  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
  %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
  ret <4 x float> %fmls1
}

define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
; CHECK-LABEL: fmls_indexed_2d:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fmls.2d v0, v2, v1[0]
; CHECK-NEXT:    ret
entry:
  %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
  %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
  ret <2 x double> %fmls1
}
; fmla with a scalar broadcast into every lane of one multiplicand.
define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_2s:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $s2 killed $s2 def $d2
; CHECK-NEXT:    fmla.2s v0, v1, v2
; CHECK-NEXT:    ret
entry:
  %v1 = insertelement <2 x float> undef, float %c, i32 0
  %v2 = insertelement <2 x float> %v1, float %c, i32 1
  %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v1, <2 x float> %b, <2 x float> %a) nounwind
  ret <2 x float> %fmla1
}

define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_4s:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT:    fmla.4s v0, v1, v2[0]
; CHECK-NEXT:    ret
entry:
  %v1 = insertelement <4 x float> undef, float %c, i32 0
  %v2 = insertelement <4 x float> %v1, float %c, i32 1
  %v3 = insertelement <4 x float> %v2, float %c, i32 2
  %v4 = insertelement <4 x float> %v3, float %c, i32 3
  %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %fmla1
}

define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_2d:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    fmla.2d v0, v1, v2[0]
; CHECK-NEXT:    ret
entry:
  %v1 = insertelement <2 x double> undef, double %c, i32 0
  %v2 = insertelement <2 x double> %v1, double %c, i32 1
  %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %fmla1
}
; mul by a splatted lane selects the indexed mul form.
define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: mul_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    mul.4h v0, v0, v1[1]
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = mul <4 x i16> %tmp1, %tmp3
  ret <4 x i16> %tmp4
}

define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: mul_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    mul.8h v0, v0, v1[1]
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp4 = mul <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: mul_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    mul.2s v0, v0, v1[1]
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp4 = mul <2 x i32> %tmp1, %tmp3
  ret <2 x i32> %tmp4
}

define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: mul_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    mul.4s v0, v0, v1[1]
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = mul <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}
904 define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
905 ; CHECK-LABEL: mul_2d:
907 ; CHECK-NEXT: fmov x10, d1
908 ; CHECK-NEXT: fmov x11, d0
909 ; CHECK-NEXT: mov.d x8, v1[1]
910 ; CHECK-NEXT: mov.d x9, v0[1]
911 ; CHECK-NEXT: mul x10, x11, x10
912 ; CHECK-NEXT: mul x8, x9, x8
913 ; CHECK-NEXT: fmov d0, x10
914 ; CHECK-NEXT: mov.d v0[1], x8
916 %tmp1 = mul <2 x i64> %A, %B
920 define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
921 ; CHECK-LABEL: fmul_lane_2s:
923 ; CHECK-NEXT: ldr d0, [x0]
924 ; CHECK-NEXT: ldr d1, [x1]
925 ; CHECK-NEXT: fmul.2s v0, v0, v1[1]
927 %tmp1 = load <2 x float>, <2 x float>* %A
928 %tmp2 = load <2 x float>, <2 x float>* %B
929 %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
930 %tmp4 = fmul <2 x float> %tmp1, %tmp3
931 ret <2 x float> %tmp4
934 define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
935 ; CHECK-LABEL: fmul_lane_4s:
937 ; CHECK-NEXT: ldr q0, [x0]
938 ; CHECK-NEXT: ldr q1, [x1]
939 ; CHECK-NEXT: fmul.4s v0, v0, v1[1]
941 %tmp1 = load <4 x float>, <4 x float>* %A
942 %tmp2 = load <4 x float>, <4 x float>* %B
943 %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
944 %tmp4 = fmul <4 x float> %tmp1, %tmp3
945 ret <4 x float> %tmp4
948 define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
949 ; CHECK-LABEL: fmul_lane_2d:
951 ; CHECK-NEXT: ldr q0, [x0]
952 ; CHECK-NEXT: ldr q1, [x1]
953 ; CHECK-NEXT: fmul.2d v0, v0, v1[1]
955 %tmp1 = load <2 x double>, <2 x double>* %A
956 %tmp2 = load <2 x double>, <2 x double>* %B
957 %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
958 %tmp4 = fmul <2 x double> %tmp1, %tmp3
959 ret <2 x double> %tmp4
962 define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
963 ; CHECK-LABEL: fmul_lane_s:
965 ; CHECK-NEXT: fmul.s s0, s0, v1[3]
967 %B = extractelement <4 x float> %vec, i32 3
968 %res = fmul float %A, %B
972 define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
973 ; CHECK-LABEL: fmul_lane_d:
975 ; CHECK-NEXT: fmul.d d0, d0, v1[1]
977 %B = extractelement <2 x double> %vec, i32 1
978 %res = fmul double %A, %B
984 define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
985 ; CHECK-LABEL: fmulx_lane_2s:
987 ; CHECK-NEXT: ldr d0, [x0]
988 ; CHECK-NEXT: ldr d1, [x1]
989 ; CHECK-NEXT: fmulx.2s v0, v0, v1[1]
991 %tmp1 = load <2 x float>, <2 x float>* %A
992 %tmp2 = load <2 x float>, <2 x float>* %B
993 %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
994 %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
995 ret <2 x float> %tmp4
998 define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
999 ; CHECK-LABEL: fmulx_lane_4s:
1001 ; CHECK-NEXT: ldr q0, [x0]
1002 ; CHECK-NEXT: ldr q1, [x1]
1003 ; CHECK-NEXT: fmulx.4s v0, v0, v1[1]
1005 %tmp1 = load <4 x float>, <4 x float>* %A
1006 %tmp2 = load <4 x float>, <4 x float>* %B
1007 %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1008 %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
1009 ret <4 x float> %tmp4
1012 define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
1013 ; CHECK-LABEL: fmulx_lane_2d:
1015 ; CHECK-NEXT: ldr q0, [x0]
1016 ; CHECK-NEXT: ldr q1, [x1]
1017 ; CHECK-NEXT: fmulx.2d v0, v0, v1[1]
1019 %tmp1 = load <2 x double>, <2 x double>* %A
1020 %tmp2 = load <2 x double>, <2 x double>* %B
1021 %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
1022 %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
1023 ret <2 x double> %tmp4
1026 define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
1027 ; CHECK-LABEL: sqdmulh_lane_4h:
1029 ; CHECK-NEXT: ldr d0, [x0]
1030 ; CHECK-NEXT: ldr d1, [x1]
1031 ; CHECK-NEXT: sqdmulh.4h v0, v0, v1[1]
1033 %tmp1 = load <4 x i16>, <4 x i16>* %A
1034 %tmp2 = load <4 x i16>, <4 x i16>* %B
1035 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1036 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
1040 define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
1041 ; CHECK-LABEL: sqdmulh_lane_8h:
1043 ; CHECK-NEXT: ldr q0, [x0]
1044 ; CHECK-NEXT: ldr q1, [x1]
1045 ; CHECK-NEXT: sqdmulh.8h v0, v0, v1[1]
1047 %tmp1 = load <8 x i16>, <8 x i16>* %A
1048 %tmp2 = load <8 x i16>, <8 x i16>* %B
1049 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1050 %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
1054 define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
1055 ; CHECK-LABEL: sqdmulh_lane_2s:
1057 ; CHECK-NEXT: ldr d0, [x0]
1058 ; CHECK-NEXT: ldr d1, [x1]
1059 ; CHECK-NEXT: sqdmulh.2s v0, v0, v1[1]
1061 %tmp1 = load <2 x i32>, <2 x i32>* %A
1062 %tmp2 = load <2 x i32>, <2 x i32>* %B
1063 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1064 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
1068 define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
1069 ; CHECK-LABEL: sqdmulh_lane_4s:
1071 ; CHECK-NEXT: ldr q0, [x0]
1072 ; CHECK-NEXT: ldr q1, [x1]
1073 ; CHECK-NEXT: sqdmulh.4s v0, v0, v1[1]
1075 %tmp1 = load <4 x i32>, <4 x i32>* %A
1076 %tmp2 = load <4 x i32>, <4 x i32>* %B
1077 %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1078 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
1082 define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
1083 ; CHECK-LABEL: sqdmulh_lane_1s:
1085 ; CHECK-NEXT: fmov s1, w0
1086 ; CHECK-NEXT: sqdmulh.s s0, s1, v0[1]
1087 ; CHECK-NEXT: fmov w0, s0
1089 %tmp1 = extractelement <4 x i32> %B, i32 1
1090 %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
1094 define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
1095 ; CHECK-LABEL: sqrdmulh_lane_4h:
1097 ; CHECK-NEXT: ldr d0, [x0]
1098 ; CHECK-NEXT: ldr d1, [x1]
1099 ; CHECK-NEXT: sqrdmulh.4h v0, v0, v1[1]
1101 %tmp1 = load <4 x i16>, <4 x i16>* %A
1102 %tmp2 = load <4 x i16>, <4 x i16>* %B
1103 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1104 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
1108 define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
1109 ; CHECK-LABEL: sqrdmulh_lane_8h:
1111 ; CHECK-NEXT: ldr q0, [x0]
1112 ; CHECK-NEXT: ldr q1, [x1]
1113 ; CHECK-NEXT: sqrdmulh.8h v0, v0, v1[1]
1115 %tmp1 = load <8 x i16>, <8 x i16>* %A
1116 %tmp2 = load <8 x i16>, <8 x i16>* %B
1117 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1118 %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
1122 define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
1123 ; CHECK-LABEL: sqrdmulh_lane_2s:
1125 ; CHECK-NEXT: ldr d0, [x0]
1126 ; CHECK-NEXT: ldr d1, [x1]
1127 ; CHECK-NEXT: sqrdmulh.2s v0, v0, v1[1]
1129 %tmp1 = load <2 x i32>, <2 x i32>* %A
1130 %tmp2 = load <2 x i32>, <2 x i32>* %B
1131 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1132 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
1136 define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
1137 ; CHECK-LABEL: sqrdmulh_lane_4s:
1139 ; CHECK-NEXT: ldr q0, [x0]
1140 ; CHECK-NEXT: ldr q1, [x1]
1141 ; CHECK-NEXT: sqrdmulh.4s v0, v0, v1[1]
1143 %tmp1 = load <4 x i32>, <4 x i32>* %A
1144 %tmp2 = load <4 x i32>, <4 x i32>* %B
1145 %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1146 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
1150 define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
1151 ; CHECK-LABEL: sqrdmulh_lane_1s:
1153 ; CHECK-NEXT: fmov s1, w0
1154 ; CHECK-NEXT: sqrdmulh.s s0, s1, v0[1]
1155 ; CHECK-NEXT: fmov w0, s0
1157 %tmp1 = extractelement <4 x i32> %B, i32 1
1158 %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
1162 define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
1163 ; CHECK-LABEL: sqdmull_lane_4s:
1165 ; CHECK-NEXT: ldr d0, [x0]
1166 ; CHECK-NEXT: ldr d1, [x1]
1167 ; CHECK-NEXT: sqdmull.4s v0, v0, v1[1]
1169 %tmp1 = load <4 x i16>, <4 x i16>* %A
1170 %tmp2 = load <4 x i16>, <4 x i16>* %B
1171 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1172 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
1176 define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
1177 ; CHECK-LABEL: sqdmull_lane_2d:
1179 ; CHECK-NEXT: ldr d0, [x0]
1180 ; CHECK-NEXT: ldr d1, [x1]
1181 ; CHECK-NEXT: sqdmull.2d v0, v0, v1[1]
1183 %tmp1 = load <2 x i32>, <2 x i32>* %A
1184 %tmp2 = load <2 x i32>, <2 x i32>* %B
1185 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1186 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
1190 define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
1191 ; CHECK-LABEL: sqdmull2_lane_4s:
1193 ; CHECK-NEXT: ldr d0, [x0, #8]
1194 ; CHECK-NEXT: ldr d1, [x1]
1195 ; CHECK-NEXT: sqdmull.4s v0, v0, v1[1]
1197 %load1 = load <8 x i16>, <8 x i16>* %A
1198 %load2 = load <8 x i16>, <8 x i16>* %B
1199 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1200 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1201 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1205 define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
1206 ; CHECK-LABEL: sqdmull2_lane_2d:
1208 ; CHECK-NEXT: ldr d0, [x0, #8]
1209 ; CHECK-NEXT: ldr d1, [x1]
1210 ; CHECK-NEXT: sqdmull.2d v0, v0, v1[1]
1212 %load1 = load <4 x i32>, <4 x i32>* %A
1213 %load2 = load <4 x i32>, <4 x i32>* %B
1214 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1215 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1216 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1220 define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
1221 ; CHECK-LABEL: umull_lane_4s:
1223 ; CHECK-NEXT: ldr d0, [x0]
1224 ; CHECK-NEXT: ldr d1, [x1]
1225 ; CHECK-NEXT: umull.4s v0, v0, v1[1]
1227 %tmp1 = load <4 x i16>, <4 x i16>* %A
1228 %tmp2 = load <4 x i16>, <4 x i16>* %B
1229 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1230 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
1234 define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
1235 ; CHECK-LABEL: umull_lane_2d:
1237 ; CHECK-NEXT: ldr d0, [x0]
1238 ; CHECK-NEXT: ldr d1, [x1]
1239 ; CHECK-NEXT: umull.2d v0, v0, v1[1]
1241 %tmp1 = load <2 x i32>, <2 x i32>* %A
1242 %tmp2 = load <2 x i32>, <2 x i32>* %B
1243 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1244 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
1248 define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
1249 ; CHECK-LABEL: smull_lane_4s:
1251 ; CHECK-NEXT: ldr d0, [x0]
1252 ; CHECK-NEXT: ldr d1, [x1]
1253 ; CHECK-NEXT: smull.4s v0, v0, v1[1]
1255 %tmp1 = load <4 x i16>, <4 x i16>* %A
1256 %tmp2 = load <4 x i16>, <4 x i16>* %B
1257 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1258 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
1262 define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
1263 ; CHECK-LABEL: smull_lane_2d:
1265 ; CHECK-NEXT: ldr d0, [x0]
1266 ; CHECK-NEXT: ldr d1, [x1]
1267 ; CHECK-NEXT: smull.2d v0, v0, v1[1]
1269 %tmp1 = load <2 x i32>, <2 x i32>* %A
1270 %tmp2 = load <2 x i32>, <2 x i32>* %B
1271 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1272 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
1276 define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1277 ; CHECK-LABEL: smlal_lane_4s:
1279 ; CHECK-NEXT: ldr d1, [x0]
1280 ; CHECK-NEXT: ldr d2, [x1]
1281 ; CHECK-NEXT: ldr q0, [x2]
1282 ; CHECK-NEXT: smlal.4s v0, v1, v2[1]
1284 %tmp1 = load <4 x i16>, <4 x i16>* %A
1285 %tmp2 = load <4 x i16>, <4 x i16>* %B
1286 %tmp3 = load <4 x i32>, <4 x i32>* %C
1287 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1288 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1289 %tmp6 = add <4 x i32> %tmp3, %tmp5
1293 define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1294 ; CHECK-LABEL: smlal_lane_2d:
1296 ; CHECK-NEXT: ldr d1, [x0]
1297 ; CHECK-NEXT: ldr d2, [x1]
1298 ; CHECK-NEXT: ldr q0, [x2]
1299 ; CHECK-NEXT: smlal.2d v0, v1, v2[1]
1301 %tmp1 = load <2 x i32>, <2 x i32>* %A
1302 %tmp2 = load <2 x i32>, <2 x i32>* %B
1303 %tmp3 = load <2 x i64>, <2 x i64>* %C
1304 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1305 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1306 %tmp6 = add <2 x i64> %tmp3, %tmp5
1310 define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1311 ; CHECK-LABEL: sqdmlal_lane_4s:
1313 ; CHECK-NEXT: ldr d1, [x0]
1314 ; CHECK-NEXT: ldr d2, [x1]
1315 ; CHECK-NEXT: ldr q0, [x2]
1316 ; CHECK-NEXT: sqdmlal.4s v0, v1, v2[1]
1318 %tmp1 = load <4 x i16>, <4 x i16>* %A
1319 %tmp2 = load <4 x i16>, <4 x i16>* %B
1320 %tmp3 = load <4 x i32>, <4 x i32>* %C
1321 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1322 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1323 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1327 define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1328 ; CHECK-LABEL: sqdmlal_lane_2d:
1330 ; CHECK-NEXT: ldr d1, [x0]
1331 ; CHECK-NEXT: ldr d2, [x1]
1332 ; CHECK-NEXT: ldr q0, [x2]
1333 ; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1]
1335 %tmp1 = load <2 x i32>, <2 x i32>* %A
1336 %tmp2 = load <2 x i32>, <2 x i32>* %B
1337 %tmp3 = load <2 x i64>, <2 x i64>* %C
1338 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1339 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1340 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1344 define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
1345 ; CHECK-LABEL: sqdmlal2_lane_4s:
1347 ; CHECK-NEXT: ldr q0, [x2]
1348 ; CHECK-NEXT: ldr d1, [x0, #8]
1349 ; CHECK-NEXT: ldr d2, [x1]
1350 ; CHECK-NEXT: sqdmlal.4s v0, v1, v2[1]
1352 %load1 = load <8 x i16>, <8 x i16>* %A
1353 %load2 = load <8 x i16>, <8 x i16>* %B
1354 %tmp3 = load <4 x i32>, <4 x i32>* %C
1355 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1356 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1357 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1358 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1362 define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
1363 ; CHECK-LABEL: sqdmlal2_lane_2d:
1365 ; CHECK-NEXT: ldr q0, [x2]
1366 ; CHECK-NEXT: ldr d1, [x0, #8]
1367 ; CHECK-NEXT: ldr d2, [x1]
1368 ; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1]
1370 %load1 = load <4 x i32>, <4 x i32>* %A
1371 %load2 = load <4 x i32>, <4 x i32>* %B
1372 %tmp3 = load <2 x i64>, <2 x i64>* %C
1373 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1374 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1375 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1376 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1380 define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1381 ; CHECK-LABEL: sqdmlal_lane_1s:
1383 ; CHECK-NEXT: fmov s1, w1
1384 ; CHECK-NEXT: fmov s2, w0
1385 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1386 ; CHECK-NEXT: sqdmlal.4s v2, v1, v0[1]
1387 ; CHECK-NEXT: fmov w0, s2
1389 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1390 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1391 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1392 %prod = extractelement <4 x i32> %prod.vec, i32 0
1393 %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
1396 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
1398 define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1399 ; CHECK-LABEL: sqdmlsl_lane_1s:
1401 ; CHECK-NEXT: fmov s1, w1
1402 ; CHECK-NEXT: fmov s2, w0
1403 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1404 ; CHECK-NEXT: sqdmlsl.4s v2, v1, v0[1]
1405 ; CHECK-NEXT: fmov w0, s2
1407 %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1408 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1409 %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1410 %prod = extractelement <4 x i32> %prod.vec, i32 0
1411 %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
1414 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
1416 define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1417 ; CHECK-LABEL: sqdmlal_lane_1d:
1419 ; CHECK-NEXT: fmov d1, x0
1420 ; CHECK-NEXT: fmov s2, w1
1421 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1422 ; CHECK-NEXT: sqdmlal.s d1, s2, v0[1]
1423 ; CHECK-NEXT: fmov x0, d1
1425 %rhs = extractelement <2 x i32> %C, i32 1
1426 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1427 %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
1430 declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
1431 declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
1433 define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1434 ; CHECK-LABEL: sqdmlsl_lane_1d:
1436 ; CHECK-NEXT: fmov d1, x0
1437 ; CHECK-NEXT: fmov s2, w1
1438 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1439 ; CHECK-NEXT: sqdmlsl.s d1, s2, v0[1]
1440 ; CHECK-NEXT: fmov x0, d1
1442 %rhs = extractelement <2 x i32> %C, i32 1
1443 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1444 %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
1447 declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
1450 define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1451 ; CHECK-LABEL: umlal_lane_4s:
1453 ; CHECK-NEXT: ldr d1, [x0]
1454 ; CHECK-NEXT: ldr d2, [x1]
1455 ; CHECK-NEXT: ldr q0, [x2]
1456 ; CHECK-NEXT: umlal.4s v0, v1, v2[1]
1458 %tmp1 = load <4 x i16>, <4 x i16>* %A
1459 %tmp2 = load <4 x i16>, <4 x i16>* %B
1460 %tmp3 = load <4 x i32>, <4 x i32>* %C
1461 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1462 %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1463 %tmp6 = add <4 x i32> %tmp3, %tmp5
1467 define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1468 ; CHECK-LABEL: umlal_lane_2d:
1470 ; CHECK-NEXT: ldr d1, [x0]
1471 ; CHECK-NEXT: ldr d2, [x1]
1472 ; CHECK-NEXT: ldr q0, [x2]
1473 ; CHECK-NEXT: umlal.2d v0, v1, v2[1]
1475 %tmp1 = load <2 x i32>, <2 x i32>* %A
1476 %tmp2 = load <2 x i32>, <2 x i32>* %B
1477 %tmp3 = load <2 x i64>, <2 x i64>* %C
1478 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1479 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1480 %tmp6 = add <2 x i64> %tmp3, %tmp5
1485 define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1486 ; CHECK-LABEL: smlsl_lane_4s:
1488 ; CHECK-NEXT: ldr d1, [x0]
1489 ; CHECK-NEXT: ldr d2, [x1]
1490 ; CHECK-NEXT: ldr q0, [x2]
1491 ; CHECK-NEXT: smlsl.4s v0, v1, v2[1]
1493 %tmp1 = load <4 x i16>, <4 x i16>* %A
1494 %tmp2 = load <4 x i16>, <4 x i16>* %B
1495 %tmp3 = load <4 x i32>, <4 x i32>* %C
1496 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1497 %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1498 %tmp6 = sub <4 x i32> %tmp3, %tmp5
1502 define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1503 ; CHECK-LABEL: smlsl_lane_2d:
1505 ; CHECK-NEXT: ldr d1, [x0]
1506 ; CHECK-NEXT: ldr d2, [x1]
1507 ; CHECK-NEXT: ldr q0, [x2]
1508 ; CHECK-NEXT: smlsl.2d v0, v1, v2[1]
1510 %tmp1 = load <2 x i32>, <2 x i32>* %A
1511 %tmp2 = load <2 x i32>, <2 x i32>* %B
1512 %tmp3 = load <2 x i64>, <2 x i64>* %C
1513 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1514 %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1515 %tmp6 = sub <2 x i64> %tmp3, %tmp5
1519 define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1520 ; CHECK-LABEL: sqdmlsl_lane_4s:
1522 ; CHECK-NEXT: ldr d1, [x0]
1523 ; CHECK-NEXT: ldr d2, [x1]
1524 ; CHECK-NEXT: ldr q0, [x2]
1525 ; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[1]
1527 %tmp1 = load <4 x i16>, <4 x i16>* %A
1528 %tmp2 = load <4 x i16>, <4 x i16>* %B
1529 %tmp3 = load <4 x i32>, <4 x i32>* %C
1530 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1531 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1532 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1536 define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1537 ; CHECK-LABEL: sqdmlsl_lane_2d:
1539 ; CHECK-NEXT: ldr d1, [x0]
1540 ; CHECK-NEXT: ldr d2, [x1]
1541 ; CHECK-NEXT: ldr q0, [x2]
1542 ; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1]
1544 %tmp1 = load <2 x i32>, <2 x i32>* %A
1545 %tmp2 = load <2 x i32>, <2 x i32>* %B
1546 %tmp3 = load <2 x i64>, <2 x i64>* %C
1547 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1548 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1549 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1553 define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
1554 ; CHECK-LABEL: sqdmlsl2_lane_4s:
1556 ; CHECK-NEXT: ldr q0, [x2]
1557 ; CHECK-NEXT: ldr d1, [x0, #8]
1558 ; CHECK-NEXT: ldr d2, [x1]
1559 ; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[1]
1561 %load1 = load <8 x i16>, <8 x i16>* %A
1562 %load2 = load <8 x i16>, <8 x i16>* %B
1563 %tmp3 = load <4 x i32>, <4 x i32>* %C
1564 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1565 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1566 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1567 %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
1571 define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
1572 ; CHECK-LABEL: sqdmlsl2_lane_2d:
1574 ; CHECK-NEXT: ldr q0, [x2]
1575 ; CHECK-NEXT: ldr d1, [x0, #8]
1576 ; CHECK-NEXT: ldr d2, [x1]
1577 ; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1]
1579 %load1 = load <4 x i32>, <4 x i32>* %A
1580 %load2 = load <4 x i32>, <4 x i32>* %B
1581 %tmp3 = load <2 x i64>, <2 x i64>* %C
1582 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1583 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1584 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1585 %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
1589 define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
1590 ; CHECK-LABEL: umlsl_lane_4s:
1592 ; CHECK-NEXT: ldr d1, [x0]
1593 ; CHECK-NEXT: ldr d2, [x1]
1594 ; CHECK-NEXT: ldr q0, [x2]
1595 ; CHECK-NEXT: umlsl.4s v0, v1, v2[1]
1597 %tmp1 = load <4 x i16>, <4 x i16>* %A
1598 %tmp2 = load <4 x i16>, <4 x i16>* %B
1599 %tmp3 = load <4 x i32>, <4 x i32>* %C
1600 %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1601 %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
1602 %tmp6 = sub <4 x i32> %tmp3, %tmp5
1606 define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
1607 ; CHECK-LABEL: umlsl_lane_2d:
1609 ; CHECK-NEXT: ldr d1, [x0]
1610 ; CHECK-NEXT: ldr d2, [x1]
1611 ; CHECK-NEXT: ldr q0, [x2]
1612 ; CHECK-NEXT: umlsl.2d v0, v1, v2[1]
1614 %tmp1 = load <2 x i32>, <2 x i32>* %A
1615 %tmp2 = load <2 x i32>, <2 x i32>* %B
1616 %tmp3 = load <2 x i64>, <2 x i64>* %C
1617 %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
1618 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
1619 %tmp6 = sub <2 x i64> %tmp3, %tmp5
1624 define float @fmulxs(float %a, float %b) nounwind {
1625 ; CHECK-LABEL: fmulxs:
1627 ; CHECK-NEXT: fmulx s0, s0, s1
1629 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
1633 define double @fmulxd(double %a, double %b) nounwind {
1634 ; CHECK-LABEL: fmulxd:
1636 ; CHECK-NEXT: fmulx d0, d0, d1
1638 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
1642 define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
1643 ; CHECK-LABEL: fmulxs_lane:
1645 ; CHECK-NEXT: fmulx.s s0, s0, v1[3]
1647 %b = extractelement <4 x float> %vec, i32 3
1648 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
1652 define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
1653 ; CHECK-LABEL: fmulxd_lane:
1655 ; CHECK-NEXT: fmulx.d d0, d0, v1[1]
1657 %b = extractelement <2 x double> %vec, i32 1
1658 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
1662 declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
1663 declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
1666 define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
1667 ; CHECK-LABEL: smull2_8h_simple:
1669 ; CHECK-NEXT: smull2.8h v0, v0, v1
1671 %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1672 %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1673 %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
1677 define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
1678 ; CHECK-LABEL: foo0:
1680 ; CHECK-NEXT: smull2.8h v0, v0, v1
1682 %tmp = bitcast <16 x i8> %a to <2 x i64>
1683 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1684 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1685 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1686 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1687 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1688 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1689 ret <8 x i16> %vmull.i.i
1692 define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
1693 ; CHECK-LABEL: foo1:
1695 ; CHECK-NEXT: smull2.4s v0, v0, v1
1697 %tmp = bitcast <8 x i16> %a to <2 x i64>
1698 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1699 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1700 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1701 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1702 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1703 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1704 ret <4 x i32> %vmull2.i.i
1707 define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
1708 ; CHECK-LABEL: foo2:
1710 ; CHECK-NEXT: smull2.2d v0, v0, v1
1712 %tmp = bitcast <4 x i32> %a to <2 x i64>
1713 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1714 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1715 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1716 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1717 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1718 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1719 ret <2 x i64> %vmull2.i.i
1722 define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
1723 ; CHECK-LABEL: foo3:
1725 ; CHECK-NEXT: umull2.8h v0, v0, v1
1727 %tmp = bitcast <16 x i8> %a to <2 x i64>
1728 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1729 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1730 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1731 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1732 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1733 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1734 ret <8 x i16> %vmull.i.i
1737 define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
1738 ; CHECK-LABEL: foo4:
1740 ; CHECK-NEXT: umull2.4s v0, v0, v1
1742 %tmp = bitcast <8 x i16> %a to <2 x i64>
1743 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1744 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1745 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1746 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1747 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1748 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1749 ret <4 x i32> %vmull2.i.i
1752 define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
1753 ; CHECK-LABEL: foo5:
1755 ; CHECK-NEXT: umull2.2d v0, v0, v1
1757 %tmp = bitcast <4 x i32> %a to <2 x i64>
1758 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1759 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1760 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1761 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1762 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1763 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1764 ret <2 x i64> %vmull2.i.i
; foo6/foo6a/foo7/foo7a: signed widening multiply where one operand is a
; broadcast of a single lane of %c. Extracting the high half of %b (lane 1
; of the <2 x i64> bitcast) must select the lane-indexed "smull2" form;
; extracting the low half (lane 0) must select plain "smull". Either way
; the dup of %c's lane folds into the v2[<lane>] operand (see CHECK lines).
1767 define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1768 ; CHECK-LABEL: foo6:
1769 ; CHECK: // %bb.0: // %entry
1770 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1771 ; CHECK-NEXT: smull2.4s v0, v1, v2[1]
1774 %0 = bitcast <8 x i16> %b to <2 x i64>
1775 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1776 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1777 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1778 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1779 ret <4 x i32> %vmull2.i
; Low half (lane 0) of %b -> plain smull.4s by element.
1782 define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1783 ; CHECK-LABEL: foo6a:
1784 ; CHECK: // %bb.0: // %entry
1785 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1786 ; CHECK-NEXT: smull.4s v0, v1, v2[1]
1789 %0 = bitcast <8 x i16> %b to <2 x i64>
1790 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1791 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1792 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1793 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1794 ret <4 x i32> %vmull2.i
; 32->64-bit variant, high half -> smull2.2d by element.
1797 define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1798 ; CHECK-LABEL: foo7:
1799 ; CHECK: // %bb.0: // %entry
1800 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1801 ; CHECK-NEXT: smull2.2d v0, v1, v2[1]
1804 %0 = bitcast <4 x i32> %b to <2 x i64>
1805 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1806 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1807 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1808 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1809 ret <2 x i64> %vmull2.i
; 32->64-bit variant, low half -> plain smull.2d by element.
1812 define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1813 ; CHECK-LABEL: foo7a:
1814 ; CHECK: // %bb.0: // %entry
1815 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1816 ; CHECK-NEXT: smull.2d v0, v1, v2[1]
1819 %0 = bitcast <4 x i32> %b to <2 x i64>
1820 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1821 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1822 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1823 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1824 ret <2 x i64> %vmull2.i
; foo8/foo8a/foo9/foo9a: unsigned counterparts of foo6-foo7a. High-half
; extract -> lane-indexed "umull2"; low-half extract -> plain "umull";
; the dup of %c's lane folds into the v2[<lane>] operand (see CHECK lines).
1828 define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1829 ; CHECK-LABEL: foo8:
1830 ; CHECK: // %bb.0: // %entry
1831 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1832 ; CHECK-NEXT: umull2.4s v0, v1, v2[1]
1835 %0 = bitcast <8 x i16> %b to <2 x i64>
1836 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1837 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1838 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1839 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1840 ret <4 x i32> %vmull2.i
; Low half (lane 0) of %b -> plain umull.4s by element.
1843 define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1844 ; CHECK-LABEL: foo8a:
1845 ; CHECK: // %bb.0: // %entry
1846 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1847 ; CHECK-NEXT: umull.4s v0, v1, v2[1]
1850 %0 = bitcast <8 x i16> %b to <2 x i64>
1851 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1852 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1853 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1854 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1855 ret <4 x i32> %vmull2.i
; 32->64-bit variant, high half -> umull2.2d by element.
1858 define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1859 ; CHECK-LABEL: foo9:
1860 ; CHECK: // %bb.0: // %entry
1861 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1862 ; CHECK-NEXT: umull2.2d v0, v1, v2[1]
1865 %0 = bitcast <4 x i32> %b to <2 x i64>
1866 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1867 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1868 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1869 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1870 ret <2 x i64> %vmull2.i
; 32->64-bit variant, low half -> plain umull.2d by element.
1873 define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1874 ; CHECK-LABEL: foo9a:
1875 ; CHECK: // %bb.0: // %entry
1876 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1877 ; CHECK-NEXT: umull.2d v0, v1, v2[1]
1880 %0 = bitcast <4 x i32> %b to <2 x i64>
1881 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1882 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1883 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1884 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1885 ret <2 x i64> %vmull2.i
; bar0/bar1/bar2: a signed widening multiply of two high halves followed by
; an add of the accumulator must fuse into a single "smlal2" (multiply-
; accumulate, high half) instruction (see CHECK lines).
1888 define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
1889 ; CHECK-LABEL: bar0:
1891 ; CHECK-NEXT: smlal2.8h v0, v1, v2
1893 %tmp = bitcast <16 x i8> %b to <2 x i64>
1894 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1895 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
1896 %tmp2 = bitcast <16 x i8> %c to <2 x i64>
1897 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1898 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
1899 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1900 %add.i = add <8 x i16> %vmull.i.i.i, %a
1901 ret <8 x i16> %add.i
; 16->32-bit accumulate variant (smlal2.4s).
1904 define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
1905 ; CHECK-LABEL: bar1:
1907 ; CHECK-NEXT: smlal2.4s v0, v1, v2
1909 %tmp = bitcast <8 x i16> %b to <2 x i64>
1910 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1911 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
1912 %tmp2 = bitcast <8 x i16> %c to <2 x i64>
1913 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1914 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
1915 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1916 %add.i = add <4 x i32> %vmull2.i.i.i, %a
1917 ret <4 x i32> %add.i
; 32->64-bit accumulate variant (smlal2.2d).
1920 define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
1921 ; CHECK-LABEL: bar2:
1923 ; CHECK-NEXT: smlal2.2d v0, v1, v2
1925 %tmp = bitcast <4 x i32> %b to <2 x i64>
1926 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1927 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
1928 %tmp2 = bitcast <4 x i32> %c to <2 x i64>
1929 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1930 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
1931 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1932 %add.i = add <2 x i64> %vmull2.i.i.i, %a
1933 ret <2 x i64> %add.i
; bar3/bar4/bar5: unsigned counterparts of bar0-bar2. High-half widening
; multiply + add of the accumulator must fuse into "umlal2" (see CHECK lines).
1936 define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
1937 ; CHECK-LABEL: bar3:
1939 ; CHECK-NEXT: umlal2.8h v0, v1, v2
1941 %tmp = bitcast <16 x i8> %b to <2 x i64>
1942 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1943 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
1944 %tmp2 = bitcast <16 x i8> %c to <2 x i64>
1945 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1946 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
1947 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1948 %add.i = add <8 x i16> %vmull.i.i.i, %a
1949 ret <8 x i16> %add.i
; 16->32-bit accumulate variant (umlal2.4s).
1952 define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
1953 ; CHECK-LABEL: bar4:
1955 ; CHECK-NEXT: umlal2.4s v0, v1, v2
1957 %tmp = bitcast <8 x i16> %b to <2 x i64>
1958 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1959 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
1960 %tmp2 = bitcast <8 x i16> %c to <2 x i64>
1961 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1962 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
1963 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1964 %add.i = add <4 x i32> %vmull2.i.i.i, %a
1965 ret <4 x i32> %add.i
; 32->64-bit accumulate variant (umlal2.2d).
1968 define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
1969 ; CHECK-LABEL: bar5:
1971 ; CHECK-NEXT: umlal2.2d v0, v1, v2
1973 %tmp = bitcast <4 x i32> %b to <2 x i64>
1974 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1975 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
1976 %tmp2 = bitcast <4 x i32> %c to <2 x i64>
1977 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1978 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
1979 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1980 %add.i = add <2 x i64> %vmull2.i.i.i, %a
1981 ret <2 x i64> %add.i
; mlal2_1/mlal2_2/mlal2_4/mlal2_5: multiply-accumulate (high half, by lane)
; where the lane operand is first broadcast to a full 128-bit vector and
; then the high half of that broadcast is taken. Because every element of
; the broadcast is the same lane, isel must still emit the lane-indexed
; s/umlal2 form (v2[<lane>]) rather than materializing the dup'd vector
; (see CHECK lines).
1984 define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
1985 ; CHECK-LABEL: mlal2_1:
1987 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1988 ; CHECK-NEXT: smlal2.4s v0, v1, v2[3]
1990 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1991 %tmp = bitcast <8 x i16> %b to <2 x i64>
1992 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1993 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1994 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
1995 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1996 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1997 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1998 %add = add <4 x i32> %vmull2.i.i, %a
; 32->64-bit signed variant (smlal2.2d by lane 1).
2002 define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
2003 ; CHECK-LABEL: mlal2_2:
2005 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2006 ; CHECK-NEXT: smlal2.2d v0, v1, v2[1]
2008 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2009 %tmp = bitcast <4 x i32> %b to <2 x i64>
2010 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2011 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2012 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
2013 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2014 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
2015 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2016 %add = add <2 x i64> %vmull2.i.i, %a
; Unsigned 16->32-bit variant (umlal2.4s by lane 2).
2020 define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
2021 ; CHECK-LABEL: mlal2_4:
2023 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2024 ; CHECK-NEXT: umlal2.4s v0, v1, v2[2]
2026 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2027 %tmp = bitcast <8 x i16> %b to <2 x i64>
2028 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2029 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2030 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
2031 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2032 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
2033 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
2034 %add = add <4 x i32> %vmull2.i.i, %a
; Unsigned 32->64-bit variant, broadcast of lane 0 (umlal2.2d by lane 0).
2038 define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
2039 ; CHECK-LABEL: mlal2_5:
2041 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2042 ; CHECK-NEXT: umlal2.2d v0, v1, v2[0]
2044 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
2045 %tmp = bitcast <4 x i32> %b to <2 x i64>
2046 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2047 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2048 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
2049 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2050 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
2051 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2052 %add = add <2 x i64> %vmull2.i.i, %a
; vmulq_n_f64/vmulq_n_f32/vmul_n_f32: fmul by a scalar splatted with
; insertelement chains must select the by-element form "fmul ... v1[0]"
; instead of materializing a dup (see CHECK lines).
2057 define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
2058 ; CHECK-LABEL: vmulq_n_f64:
2059 ; CHECK: // %bb.0: // %entry
2060 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2061 ; CHECK-NEXT: fmul.2d v0, v0, v1[0]
2064 %vecinit.i = insertelement <2 x double> undef, double %y, i32 0
2065 %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
2066 %mul.i = fmul <2 x double> %vecinit1.i, %x
2067 ret <2 x double> %mul.i
; 128-bit single-precision variant (fmul.4s by element 0).
2070 define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
2071 ; CHECK-LABEL: vmulq_n_f32:
2072 ; CHECK: // %bb.0: // %entry
2073 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
2074 ; CHECK-NEXT: fmul.4s v0, v0, v1[0]
2077 %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
2078 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
2079 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
2080 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
2081 %mul.i = fmul <4 x float> %vecinit3.i, %x
2082 ret <4 x float> %mul.i
; 64-bit single-precision variant (fmul.2s by element 0).
2085 define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
2086 ; CHECK-LABEL: vmul_n_f32:
2087 ; CHECK: // %bb.0: // %entry
2088 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
2089 ; CHECK-NEXT: fmul.2s v0, v0, v1[0]
2092 %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
2093 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
2094 %mul.i = fmul <2 x float> %vecinit1.i, %x
2095 ret <2 x float> %mul.i
; vmla_laneq_*: mul by a lane broadcast from a 128-bit vector, then add,
; must fuse into the by-element "mla" instruction (see CHECK lines).
2098 define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
2099 ; CHECK-LABEL: vmla_laneq_s16_test:
2100 ; CHECK: // %bb.0: // %entry
2101 ; CHECK-NEXT: mla.4h v0, v1, v2[6]
2104 %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
2105 %mul = mul <4 x i16> %shuffle, %b
2106 %add = add <4 x i16> %mul, %a
; 32-bit element variant (mla.2s by lane 3).
2110 define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
2111 ; CHECK-LABEL: vmla_laneq_s32_test:
2112 ; CHECK: // %bb.0: // %entry
2113 ; CHECK-NEXT: mla.2s v0, v1, v2[3]
2116 %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
2117 %mul = mul <2 x i32> %shuffle, %b
2118 %add = add <2 x i32> %mul, %a
; "not_really" tests: the lane is broadcast via a two-step shuffle (first
; extract the top half, then broadcast element 1 of it). That composes to a
; single original lane (e.g. elements 4..7 then lane 1 == lane 5), so isel
; should still collapse it to one by-element mla (see CHECK lines).
2122 define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
2123 ; CHECK-LABEL: not_really_vmlaq_laneq_s16_test:
2124 ; CHECK: // %bb.0: // %entry
2125 ; CHECK-NEXT: mla.8h v0, v1, v2[5]
2128 %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2129 %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2130 %mul = mul <8 x i16> %shuffle2, %b
2131 %add = add <8 x i16> %mul, %a
; 32-bit element two-step variant: elements 2..3 then lane 1 == lane 3.
2135 define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
2136 ; CHECK-LABEL: not_really_vmlaq_laneq_s32_test:
2137 ; CHECK: // %bb.0: // %entry
2138 ; CHECK-NEXT: mla.4s v0, v1, v2[3]
2141 %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2142 %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2143 %mul = mul <4 x i32> %shuffle2, %b
2144 %add = add <4 x i32> %mul, %a
; vmull_laneq_*: widening multiply where one operand is a lane broadcast
; from a 128-bit vector; must select the by-element smull/umull form with
; the lane index preserved (see CHECK lines).
2148 define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
2149 ; CHECK-LABEL: vmull_laneq_s16_test:
2150 ; CHECK: // %bb.0: // %entry
2151 ; CHECK-NEXT: smull.4s v0, v0, v1[6]
2154 %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
2155 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
2156 ret <4 x i32> %vmull2.i
; Signed 32->64-bit variant (smull.2d by lane 2).
2159 define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
2160 ; CHECK-LABEL: vmull_laneq_s32_test:
2161 ; CHECK: // %bb.0: // %entry
2162 ; CHECK-NEXT: smull.2d v0, v0, v1[2]
2165 %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
2166 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
2167 ret <2 x i64> %vmull2.i
; Unsigned 16->32-bit variant (umull.4s by lane 6).
2169 define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
2170 ; CHECK-LABEL: vmull_laneq_u16_test:
2171 ; CHECK: // %bb.0: // %entry
2172 ; CHECK-NEXT: umull.4s v0, v0, v1[6]
2175 %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
2176 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
2177 ret <4 x i32> %vmull2.i
; Unsigned 32->64-bit variant (umull.2d by lane 2).
2180 define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
2181 ; CHECK-LABEL: vmull_laneq_u32_test:
2182 ; CHECK: // %bb.0: // %entry
2183 ; CHECK-NEXT: umull.2d v0, v0, v1[2]
2186 %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
2187 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
2188 ret <2 x i64> %vmull2.i
; vmull_low_n_* / vmull_high_n_*: widening multiply by a scalar that is
; splatted via insertelement chains. The splat becomes a "dup" from the GPR
; and the low/high-half extract of the other operand selects smull/umull or
; smull2/umull2 respectively (see CHECK lines).
2191 define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
2192 ; CHECK-LABEL: vmull_low_n_s16_test:
2193 ; CHECK: // %bb.0: // %entry
2194 ; CHECK-NEXT: dup.4h v0, w0
2195 ; CHECK-NEXT: smull.4s v0, v1, v0
2198 %conv = trunc i32 %d to i16
2199 %0 = bitcast <8 x i16> %b to <2 x i64>
2200 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
2201 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2202 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
2203 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
2204 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
2205 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
2206 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
2207 ret <4 x i32> %vmull2.i.i
; High half of %b -> dup.8h + smull2.4s.
2210 define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
2211 ; CHECK-LABEL: vmull_high_n_s16_test:
2212 ; CHECK: // %bb.0: // %entry
2213 ; CHECK-NEXT: dup.8h v0, w0
2214 ; CHECK-NEXT: smull2.4s v0, v1, v0
2217 %conv = trunc i32 %d to i16
2218 %0 = bitcast <8 x i16> %b to <2 x i64>
2219 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2220 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2221 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
2222 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
2223 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
2224 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
2225 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
2226 ret <4 x i32> %vmull2.i.i
; 32->64-bit signed high-half variant (dup.4s + smull2.2d).
2229 define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
2230 ; CHECK-LABEL: vmull_high_n_s32_test:
2231 ; CHECK: // %bb.0: // %entry
2232 ; CHECK-NEXT: dup.4s v0, w0
2233 ; CHECK-NEXT: smull2.2d v0, v1, v0
2236 %0 = bitcast <4 x i32> %b to <2 x i64>
2237 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2238 %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2239 %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
2240 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
2241 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
2242 ret <2 x i64> %vmull2.i.i
; Unsigned 16->32-bit high-half variant (dup.8h + umull2.4s).
2245 define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
2246 ; CHECK-LABEL: vmull_high_n_u16_test:
2247 ; CHECK: // %bb.0: // %entry
2248 ; CHECK-NEXT: dup.8h v0, w0
2249 ; CHECK-NEXT: umull2.4s v0, v1, v0
2252 %conv = trunc i32 %d to i16
2253 %0 = bitcast <8 x i16> %b to <2 x i64>
2254 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2255 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2256 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
2257 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
2258 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
2259 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
2260 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
2261 ret <4 x i32> %vmull2.i.i
; Unsigned 32->64-bit high-half variant (dup.4s + umull2.2d).
2264 define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
2265 ; CHECK-LABEL: vmull_high_n_u32_test:
2266 ; CHECK: // %bb.0: // %entry
2267 ; CHECK-NEXT: dup.4s v0, w0
2268 ; CHECK-NEXT: umull2.2d v0, v1, v0
2271 %0 = bitcast <4 x i32> %b to <2 x i64>
2272 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2273 %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2274 %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
2275 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
2276 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
2277 ret <2 x i64> %vmull2.i.i
; vmul_built_dup_*: a splat built manually (extractelement of a lane, then
; an insertelement chain) must be recognized as a dup so the multiply
; selects the by-element "mul ... v1[<lane>]" form (see CHECK lines).
2280 define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
2281 ; CHECK-LABEL: vmul_built_dup_test:
2283 ; CHECK-NEXT: mul.4s v0, v0, v1[1]
2285 %vget_lane = extractelement <4 x i32> %b, i32 1
2286 %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
2287 %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
2288 %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
2289 %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
2290 %prod = mul <4 x i32> %a, %vecinit3.i
; Lane taken from a 64-bit source vector (mul.4h by lane 3).
2294 define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
2295 ; CHECK-LABEL: vmul_built_dup_fromsmall_test:
2297 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2298 ; CHECK-NEXT: mul.4h v0, v0, v1[3]
2300 %vget_lane = extractelement <4 x i16> %b, i32 3
2301 %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
2302 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
2303 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
2304 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
2305 %prod = mul <4 x i16> %a, %vecinit3.i
; 128-bit result from a 64-bit lane source (mul.8h by lane 0).
2309 define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
2310 ; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
2312 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2313 ; CHECK-NEXT: mul.8h v0, v0, v1[0]
2315 %vget_lane = extractelement <4 x i16> %b, i32 0
2316 %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
2317 %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
2318 %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
2319 %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
2320 %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
2321 %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
2322 %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
2323 %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
2324 %prod = mul <8 x i16> %a, %vecinit7.i
; sqdmull from extracted halves: taking elements <2,3> of both operands and
; calling sqdmull must select "sqdmull2"; adding the saturating accumulator
; on top must fuse into "sqdmlal2" (see CHECK lines).
2328 define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
2329 ; CHECK-LABEL: mull_from_two_extracts:
2331 ; CHECK-NEXT: sqdmull2.2d v0, v0, v1
2333 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2334 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2336 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
; sqdmull2 followed by sqadd of the accumulator -> single sqdmlal2.
2340 define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2341 ; CHECK-LABEL: mlal_from_two_extracts:
2343 ; CHECK-NEXT: sqdmlal2.2d v0, v1, v2
2345 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2346 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2348 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2349 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
; Low-half extract (<0,1>) with a GPR splat rhs -> dup.2s + plain sqdmull.
2353 define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
2354 ; CHECK-LABEL: mull_from_extract_dup_low:
2356 ; CHECK-NEXT: dup.2s v1, w0
2357 ; CHECK-NEXT: sqdmull.2d v0, v0, v1
2359 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
2360 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
2362 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2364 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
; High-half extract (<2,3>) with a GPR splat rhs -> dup.4s + sqdmull2.
2368 define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
2369 ; CHECK-LABEL: mull_from_extract_dup_high:
2371 ; CHECK-NEXT: dup.4s v1, w0
2372 ; CHECK-NEXT: sqdmull2.2d v0, v0, v1
2374 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
2375 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
2377 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2379 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
; pmull (polynomial multiply) variants of the extract/dup folding tests:
; low half -> pmull, high half -> pmull2; the splat operand comes either
; from a GPR (dup.8b/dup.16b wN) or from a vector lane (dup ... v1[0]).
2383 define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
2384 ; CHECK-LABEL: pmull_from_extract_dup_low:
2386 ; CHECK-NEXT: dup.8b v1, w0
2387 ; CHECK-NEXT: pmull.8h v0, v0, v1
2389 %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
2390 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2392 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2394 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
; High-half lhs (elements 8..15) -> dup.16b + pmull2.8h.
2398 define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
2399 ; CHECK-LABEL: pmull_from_extract_dup_high:
2401 ; CHECK-NEXT: dup.16b v1, w0
2402 ; CHECK-NEXT: pmull2.8h v0, v0, v1
2404 %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
2405 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2407 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2409 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
; Rhs is a lane broadcast of a vector argument -> dup.8b v1, v1[0] + pmull.
2413 define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
2414 ; CHECK-LABEL: pmull_from_extract_duplane_low:
2416 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2417 ; CHECK-NEXT: dup.8b v1, v1[0]
2418 ; CHECK-NEXT: pmull.8h v0, v0, v1
2420 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2421 %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2423 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
; High-half lhs with lane-broadcast rhs -> dup.16b v1, v1[0] + pmull2.
2427 define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
2428 ; CHECK-LABEL: pmull_from_extract_duplane_high:
2430 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2431 ; CHECK-NEXT: dup.16b v1, v1[0]
2432 ; CHECK-NEXT: pmull2.8h v0, v0, v1
2434 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2435 %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2437 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
; sqdmull/sqdmlal/umlal with a duplane rhs: when the rhs is a broadcast of
; lane 0 of a vector argument, the dup must fold into the by-element form
; (v1[0]/v2[0]); low-half lhs selects the base opcode, high-half lhs the
; "2" variant; the saturating/plain add fuses into sqdmlal/umlal
; (see CHECK lines).
2441 define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
2442 ; CHECK-LABEL: sqdmull_from_extract_duplane_low:
2444 ; CHECK-NEXT: sqdmull.2d v0, v0, v1[0]
2446 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2447 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2449 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
; High-half lhs -> sqdmull2 by element.
2453 define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
2454 ; CHECK-LABEL: sqdmull_from_extract_duplane_high:
2456 ; CHECK-NEXT: sqdmull2.2d v0, v0, v1[0]
2458 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2459 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2461 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
; sqdmull + sqadd of accumulator, low half -> sqdmlal by element.
2465 define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2466 ; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
2468 ; CHECK-NEXT: sqdmlal.2d v0, v1, v2[0]
2470 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2471 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2473 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2474 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
; sqdmull + sqadd of accumulator, high half -> sqdmlal2 by element.
2478 define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2479 ; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
2481 ; CHECK-NEXT: sqdmlal2.2d v0, v1, v2[0]
2483 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2484 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2486 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2487 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
; umull + plain add of accumulator, low half -> umlal by element.
2491 define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2492 ; CHECK-LABEL: umlal_from_extract_duplane_low:
2494 ; CHECK-NEXT: umlal.2d v0, v1, v2[0]
2496 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2497 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2499 %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2500 %sum = add <2 x i64> %accum, %res
; umull + plain add of accumulator, high half -> umlal2 by element.
2504 define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2505 ; CHECK-LABEL: umlal_from_extract_duplane_high:
2507 ; CHECK-NEXT: umlal2.2d v0, v1, v2[0]
2509 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2510 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2512 %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2513 %sum = add <2 x i64> %accum, %res
; Scalar fma whose multiplier is extracted from a 128-bit vector: should select
; the indexed scalar form fmla.s with v2[3] instead of extracting first.
2517 define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
2518 ; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
2520 ; CHECK-NEXT: fmla.s s0, s1, v2[3]
2522 %rhs = extractelement <4 x float> %rvec, i32 3
2523 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
; As above but the source vector is 64-bit: the lane access still folds into
; indexed fmla.s after the implicit d2 -> q2 widening (the "kill" line).
2527 define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
2528 ; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
2530 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2531 ; CHECK-NEXT: fmla.s s0, s1, v2[1]
2533 %rhs = extractelement <2 x float> %rvec, i32 1
2534 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
; Negating the extracted lane (fsub from -0.0, the fneg idiom) before the fma
; should select the indexed multiply-subtract fmls.s with v2[3].
2538 define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
2539 ; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
2541 ; CHECK-NEXT: fmls.s s0, s1, v2[3]
2543 %rhs.scal = extractelement <4 x float> %rvec, i32 3
2544 %rhs = fsub float -0.0, %rhs.scal
2545 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
; Same fneg-then-fma pattern with a 64-bit source vector: still folds into
; indexed fmls.s after the implicit d2 -> q2 widening.
2549 define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
2550 ; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
2552 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2553 ; CHECK-NEXT: fmls.s s0, s1, v2[1]
2555 %rhs.scal = extractelement <2 x float> %rvec, i32 1
2556 %rhs = fsub float -0.0, %rhs.scal
2557 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
2561 declare float @llvm.fma.f32(float, float, float)
; Double-precision version: lane extract feeding an fma should select the
; indexed scalar fmla.d with v2[1].
2563 define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
2564 ; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
2566 ; CHECK-NEXT: fmla.d d0, d1, v2[1]
2568 %rhs = extractelement <2 x double> %rvec, i32 1
2569 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
; Double-precision negated version: fneg (fsub from -0.0) of the extracted lane
; before the fma should select indexed fmls.d with v2[1].
2573 define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
2574 ; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
2576 ; CHECK-NEXT: fmls.d d0, d1, v2[1]
2578 %rhs.scal = extractelement <2 x double> %rvec, i32 1
2579 %rhs = fsub double -0.0, %rhs.scal
2580 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
2584 declare double @llvm.fma.f64(double, double, double)
; The negation (fsub of the whole vector from -0.0) happens BEFORE the lane
; splat; the combine must still look through it and emit fmls.2s with v2[3].
2586 define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
2587 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
2589 ; CHECK-NEXT: fmls.2s v0, v1, v2[3]
2591 %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
2592 %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
2593 %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
2594 ret <2 x float> %res
; Variant with a 64-bit %rhs: negate-then-splat still folds into fmls.2s with
; v2[1] after the implicit d2 -> q2 widening (the "kill" line).
2597 define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
2598 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
2600 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2601 ; CHECK-NEXT: fmls.2s v0, v1, v2[1]
2603 %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
2604 %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
2605 %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
2606 ret <2 x float> %res
; 128-bit result version: vector fneg before a 4-lane splat of lane 3 should
; still fold into fmls.4s with v2[3].
2609 define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
2610 ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
2612 ; CHECK-NEXT: fmls.4s v0, v1, v2[3]
2614 %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
2615 %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
2616 %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
2617 ret <4 x float> %res
; 128-bit result, 64-bit %rhs: negate then widen-splat lane 1 should fold into
; fmls.4s with v2[1] after the implicit d2 -> q2 widening.
2620 define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
2621 ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
2623 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2624 ; CHECK-NEXT: fmls.4s v0, v1, v2[1]
2626 %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
2627 %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2628 %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
2629 ret <4 x float> %res
; Double-precision vector version: fneg before splat of lane 1 should fold
; into fmls.2d with v2[1].
2632 define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
2633 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
2635 ; CHECK-NEXT: fmls.2d v0, v1, v2[1]
2637 %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
2638 %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
2639 %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
2640 ret <2 x double> %res
; A <1 x double> fmul should lower to the plain scalar fmul d0, d0, d1.
2643 define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
2644 ; CHECK-LABEL: test_fmul_v1f64:
2646 ; CHECK-NEXT: fmul d0, d0, d1
2648 %prod = fmul <1 x double> %L, %R
2649 ret <1 x double> %prod
; A <1 x double> fdiv should lower to the plain scalar fdiv d0, d0, d1.
2652 define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
2653 ; CHECK-LABEL: test_fdiv_v1f64:
2655 ; CHECK-NEXT: fdiv d0, d0, d1
2657 %prod = fdiv <1 x double> %L, %R
2658 ret <1 x double> %prod
; Scalar sqdmulls of two i32s, saturating-added into an i64, should fuse into
; the scalar sqdmlal d0, s1, s2 (with fmov moves between GPRs and FP regs).
2661 define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
2662 ; CHECK-LABEL: sqdmlal_d:
2664 ; CHECK-NEXT: fmov d0, x2
2665 ; CHECK-NEXT: fmov s1, w0
2666 ; CHECK-NEXT: fmov s2, w1
2667 ; CHECK-NEXT: sqdmlal d0, s1, s2
2668 ; CHECK-NEXT: fmov x0, d0
2670 %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
2671 %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
; Subtracting variant: sqdmulls followed by a saturating subtract should fuse
; into the scalar sqdmlsl d0, s1, s2.
2675 define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
2676 ; CHECK-LABEL: sqdmlsl_d:
2678 ; CHECK-NEXT: fmov d0, x2
2679 ; CHECK-NEXT: fmov s1, w0
2680 ; CHECK-NEXT: fmov s2, w1
2681 ; CHECK-NEXT: sqdmlsl d0, s1, s2
2682 ; CHECK-NEXT: fmov x0, d0
2684 %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
2685 %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
; The pmull64 intrinsic on two scalar i64s should select pmull.1q after moving
; the operands into FP registers.
2689 define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
2690 ; CHECK-LABEL: test_pmull_64:
2692 ; CHECK-NEXT: fmov d0, x0
2693 ; CHECK-NEXT: fmov d1, x1
2694 ; CHECK-NEXT: pmull.1q v0, v0, v1
2696 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
; pmull64 fed by lane-1 extracts of both vectors should select the high-half
; form pmull2.1q directly, with no explicit extracts.
2700 define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
2701 ; CHECK-LABEL: test_pmull_high_64:
2703 ; CHECK-NEXT: pmull2.1q v0, v0, v1
2705 %l_hi = extractelement <2 x i64> %l, i32 1
2706 %r_hi = extractelement <2 x i64> %r, i32 1
2707 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
2711 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
2713 define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
2714 ; CHECK-LABEL: test_mul_v1i64:
2716 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2717 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
2718 ; CHECK-NEXT: fmov x8, d1
2719 ; CHECK-NEXT: fmov x9, d0
2720 ; CHECK-NEXT: mul x8, x9, x8
2721 ; CHECK-NEXT: fmov d0, x8
2723 %prod = mul <1 x i64> %lhs, %rhs