1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,GENERIC
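3 ; The instruction latencies of Exynos-M1 trigger the transform (a separate dup followed by a vector fmla/fmls instead of the by-element form) that we see under the EXYNOSM1 check prefix; the EXYNOSM3 checks keep the by-element form.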
4 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOSM1
5 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m3 | FileCheck %s --check-prefixes=CHECK,EXYNOSM3
7 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
9 declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
11 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
13 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
15 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
17 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
19 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
21 declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
23 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
25 declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
27 declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
29 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
31 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
33 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
35 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
37 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
39 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
41 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
43 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
45 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
47 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
; Splats lane 3 of <4 x i16> %v, multiplies the splat by %b and adds %a.
; The assertions expect this to fold into one by-element mla using v2.h[3].
49 define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
50 ; CHECK-LABEL: test_vmla_lane_s16:
51 ; CHECK: // %bb.0: // %entry
52 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
53 ; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[3]
56 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
57 %mul = mul <4 x i16> %shuffle, %b
58 %add = add <4 x i16> %mul, %a
; Splats lane 3 of <4 x i16> %v to eight elements, multiplies by %b and adds %a.
; The assertions expect one by-element mla on .8h operands using v2.h[3].
62 define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
63 ; CHECK-LABEL: test_vmlaq_lane_s16:
64 ; CHECK: // %bb.0: // %entry
65 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
66 ; CHECK-NEXT: mla v0.8h, v1.8h, v2.h[3]
69 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
70 %mul = mul <8 x i16> %shuffle, %b
71 %add = add <8 x i16> %mul, %a
; Splats lane 1 of <2 x i32> %v, multiplies the splat by %b and adds %a.
; The assertions expect one by-element mla using v2.s[1].
75 define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
76 ; CHECK-LABEL: test_vmla_lane_s32:
77 ; CHECK: // %bb.0: // %entry
78 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
79 ; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[1]
82 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
83 %mul = mul <2 x i32> %shuffle, %b
84 %add = add <2 x i32> %mul, %a
; Splats lane 1 of <2 x i32> %v to four elements, multiplies by %b and adds %a.
; The assertions expect one by-element mla on .4s operands using v2.s[1].
88 define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
89 ; CHECK-LABEL: test_vmlaq_lane_s32:
90 ; CHECK: // %bb.0: // %entry
91 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
92 ; CHECK-NEXT: mla v0.4s, v1.4s, v2.s[1]
95 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
96 %mul = mul <4 x i32> %shuffle, %b
97 %add = add <4 x i32> %mul, %a
; Splats lane 7 of <8 x i16> %v to four elements, multiplies by %b and adds %a.
; The assertions expect one by-element mla on .4h operands using v2.h[7].
101 define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
102 ; CHECK-LABEL: test_vmla_laneq_s16:
103 ; CHECK: // %bb.0: // %entry
104 ; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[7]
107 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
108 %mul = mul <4 x i16> %shuffle, %b
109 %add = add <4 x i16> %mul, %a
; Splats lane 7 of <8 x i16> %v, multiplies the splat by %b and adds %a.
; The assertions expect one by-element mla on .8h operands using v2.h[7].
113 define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
114 ; CHECK-LABEL: test_vmlaq_laneq_s16:
115 ; CHECK: // %bb.0: // %entry
116 ; CHECK-NEXT: mla v0.8h, v1.8h, v2.h[7]
119 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
120 %mul = mul <8 x i16> %shuffle, %b
121 %add = add <8 x i16> %mul, %a
; Splats lane 3 of <4 x i32> %v to two elements, multiplies by %b and adds %a.
; The assertions expect one by-element mla on .2s operands using v2.s[3].
125 define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
126 ; CHECK-LABEL: test_vmla_laneq_s32:
127 ; CHECK: // %bb.0: // %entry
128 ; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[3]
131 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
132 %mul = mul <2 x i32> %shuffle, %b
133 %add = add <2 x i32> %mul, %a
; Splats lane 3 of <4 x i32> %v, multiplies the splat by %b and adds %a.
; The assertions expect one by-element mla on .4s operands using v2.s[3].
137 define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
138 ; CHECK-LABEL: test_vmlaq_laneq_s32:
139 ; CHECK: // %bb.0: // %entry
140 ; CHECK-NEXT: mla v0.4s, v1.4s, v2.s[3]
143 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
144 %mul = mul <4 x i32> %shuffle, %b
145 %add = add <4 x i32> %mul, %a
; Splats lane 3 of <4 x i16> %v, multiplies by %b and subtracts the product from %a.
; The assertions expect one by-element mls using v2.h[3].
149 define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
150 ; CHECK-LABEL: test_vmls_lane_s16:
151 ; CHECK: // %bb.0: // %entry
152 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
153 ; CHECK-NEXT: mls v0.4h, v1.4h, v2.h[3]
156 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
157 %mul = mul <4 x i16> %shuffle, %b
158 %sub = sub <4 x i16> %a, %mul
; Splats lane 3 of <4 x i16> %v to eight elements, multiplies by %b and subtracts the product from %a.
; The assertions expect one by-element mls on .8h operands using v2.h[3].
162 define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
163 ; CHECK-LABEL: test_vmlsq_lane_s16:
164 ; CHECK: // %bb.0: // %entry
165 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
166 ; CHECK-NEXT: mls v0.8h, v1.8h, v2.h[3]
169 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
170 %mul = mul <8 x i16> %shuffle, %b
171 %sub = sub <8 x i16> %a, %mul
; Splats lane 1 of <2 x i32> %v, multiplies by %b and subtracts the product from %a.
; The assertions expect one by-element mls using v2.s[1].
175 define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
176 ; CHECK-LABEL: test_vmls_lane_s32:
177 ; CHECK: // %bb.0: // %entry
178 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
179 ; CHECK-NEXT: mls v0.2s, v1.2s, v2.s[1]
182 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
183 %mul = mul <2 x i32> %shuffle, %b
184 %sub = sub <2 x i32> %a, %mul
; Splats lane 1 of <2 x i32> %v to four elements, multiplies by %b and subtracts the product from %a.
; The assertions expect one by-element mls on .4s operands using v2.s[1].
188 define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
189 ; CHECK-LABEL: test_vmlsq_lane_s32:
190 ; CHECK: // %bb.0: // %entry
191 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
192 ; CHECK-NEXT: mls v0.4s, v1.4s, v2.s[1]
195 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
196 %mul = mul <4 x i32> %shuffle, %b
197 %sub = sub <4 x i32> %a, %mul
; Splats lane 7 of <8 x i16> %v to four elements, multiplies by %b and subtracts the product from %a.
; The assertions expect one by-element mls on .4h operands using v2.h[7].
201 define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
202 ; CHECK-LABEL: test_vmls_laneq_s16:
203 ; CHECK: // %bb.0: // %entry
204 ; CHECK-NEXT: mls v0.4h, v1.4h, v2.h[7]
207 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
208 %mul = mul <4 x i16> %shuffle, %b
209 %sub = sub <4 x i16> %a, %mul
; Splats lane 7 of <8 x i16> %v, multiplies by %b and subtracts the product from %a.
; The assertions expect one by-element mls on .8h operands using v2.h[7].
213 define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
214 ; CHECK-LABEL: test_vmlsq_laneq_s16:
215 ; CHECK: // %bb.0: // %entry
216 ; CHECK-NEXT: mls v0.8h, v1.8h, v2.h[7]
219 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
220 %mul = mul <8 x i16> %shuffle, %b
221 %sub = sub <8 x i16> %a, %mul
; Splats lane 3 of <4 x i32> %v to two elements, multiplies by %b and subtracts the product from %a.
; The assertions expect one by-element mls on .2s operands using v2.s[3].
225 define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
226 ; CHECK-LABEL: test_vmls_laneq_s32:
227 ; CHECK: // %bb.0: // %entry
228 ; CHECK-NEXT: mls v0.2s, v1.2s, v2.s[3]
231 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
232 %mul = mul <2 x i32> %shuffle, %b
233 %sub = sub <2 x i32> %a, %mul
; Splats lane 3 of <4 x i32> %v, multiplies by %b and subtracts the product from %a.
; The assertions expect one by-element mls on .4s operands using v2.s[3].
237 define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
238 ; CHECK-LABEL: test_vmlsq_laneq_s32:
239 ; CHECK: // %bb.0: // %entry
240 ; CHECK-NEXT: mls v0.4s, v1.4s, v2.s[3]
243 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
244 %mul = mul <4 x i32> %shuffle, %b
245 %sub = sub <4 x i32> %a, %mul
; Splats lane 3 of <4 x i16> %v and multiplies the splat by %a.
; The assertions expect one by-element mul using v1.h[3].
249 define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
250 ; CHECK-LABEL: test_vmul_lane_s16:
251 ; CHECK: // %bb.0: // %entry
252 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
253 ; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[3]
256 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
257 %mul = mul <4 x i16> %shuffle, %a
; Splats lane 3 of <4 x i16> %v to eight elements and multiplies the splat by %a.
; The assertions expect one by-element mul on .8h operands using v1.h[3].
261 define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
262 ; CHECK-LABEL: test_vmulq_lane_s16:
263 ; CHECK: // %bb.0: // %entry
264 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
265 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[3]
268 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
269 %mul = mul <8 x i16> %shuffle, %a
; Splats lane 1 of <2 x i32> %v and multiplies the splat by %a.
; The assertions expect one by-element mul using v1.s[1].
273 define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
274 ; CHECK-LABEL: test_vmul_lane_s32:
275 ; CHECK: // %bb.0: // %entry
276 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
277 ; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[1]
280 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
281 %mul = mul <2 x i32> %shuffle, %a
; Splats lane 1 of <2 x i32> %v to four elements and multiplies the splat by %a.
; The assertions expect one by-element mul on .4s operands using v1.s[1].
285 define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
286 ; CHECK-LABEL: test_vmulq_lane_s32:
287 ; CHECK: // %bb.0: // %entry
288 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
289 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[1]
292 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
293 %mul = mul <4 x i32> %shuffle, %a
; "u16"-named variant: splats lane 3 of <4 x i16> %v and multiplies the splat by %a
; with a plain IR mul. The assertions expect one by-element mul using v1.h[3].
297 define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
298 ; CHECK-LABEL: test_vmul_lane_u16:
299 ; CHECK: // %bb.0: // %entry
300 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
301 ; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[3]
304 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
305 %mul = mul <4 x i16> %shuffle, %a
; "u16"-named variant: splats lane 3 of <4 x i16> %v to eight elements and multiplies
; the splat by %a. The assertions expect one by-element mul on .8h operands using v1.h[3].
309 define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
310 ; CHECK-LABEL: test_vmulq_lane_u16:
311 ; CHECK: // %bb.0: // %entry
312 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
313 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[3]
316 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
317 %mul = mul <8 x i16> %shuffle, %a
; "u32"-named variant: splats lane 1 of <2 x i32> %v and multiplies the splat by %a.
; The assertions expect one by-element mul using v1.s[1].
321 define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
322 ; CHECK-LABEL: test_vmul_lane_u32:
323 ; CHECK: // %bb.0: // %entry
324 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
325 ; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[1]
328 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
329 %mul = mul <2 x i32> %shuffle, %a
; "u32"-named variant: splats lane 1 of <2 x i32> %v to four elements and multiplies
; the splat by %a. The assertions expect one by-element mul on .4s operands using v1.s[1].
333 define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
334 ; CHECK-LABEL: test_vmulq_lane_u32:
335 ; CHECK: // %bb.0: // %entry
336 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
337 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[1]
340 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
341 %mul = mul <4 x i32> %shuffle, %a
; Splats lane 7 of <8 x i16> %v to four elements and multiplies the splat by %a.
; The assertions expect one by-element mul on .4h operands using v1.h[7].
345 define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
346 ; CHECK-LABEL: test_vmul_laneq_s16:
347 ; CHECK: // %bb.0: // %entry
348 ; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[7]
351 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
352 %mul = mul <4 x i16> %shuffle, %a
; Splats lane 7 of <8 x i16> %v and multiplies the splat by %a.
; The assertions expect one by-element mul on .8h operands using v1.h[7].
356 define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
357 ; CHECK-LABEL: test_vmulq_laneq_s16:
358 ; CHECK: // %bb.0: // %entry
359 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[7]
362 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
363 %mul = mul <8 x i16> %shuffle, %a
; Splats lane 3 of <4 x i32> %v to two elements and multiplies the splat by %a.
; The assertions expect one by-element mul on .2s operands using v1.s[3].
367 define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
368 ; CHECK-LABEL: test_vmul_laneq_s32:
369 ; CHECK: // %bb.0: // %entry
370 ; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[3]
373 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
374 %mul = mul <2 x i32> %shuffle, %a
; Splats lane 3 of <4 x i32> %v and multiplies the splat by %a.
; The assertions expect one by-element mul on .4s operands using v1.s[3].
378 define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
379 ; CHECK-LABEL: test_vmulq_laneq_s32:
380 ; CHECK: // %bb.0: // %entry
381 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[3]
384 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
385 %mul = mul <4 x i32> %shuffle, %a
; "u16"-named variant: splats lane 7 of <8 x i16> %v to four elements and multiplies
; the splat by %a. The assertions expect one by-element mul on .4h operands using v1.h[7].
389 define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
390 ; CHECK-LABEL: test_vmul_laneq_u16:
391 ; CHECK: // %bb.0: // %entry
392 ; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[7]
395 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
396 %mul = mul <4 x i16> %shuffle, %a
; "u16"-named variant: splats lane 7 of <8 x i16> %v and multiplies the splat by %a.
; The assertions expect one by-element mul on .8h operands using v1.h[7].
400 define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
401 ; CHECK-LABEL: test_vmulq_laneq_u16:
402 ; CHECK: // %bb.0: // %entry
403 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[7]
406 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
407 %mul = mul <8 x i16> %shuffle, %a
; "u32"-named variant: splats lane 3 of <4 x i32> %v to two elements and multiplies
; the splat by %a. The assertions expect one by-element mul on .2s operands using v1.s[3].
411 define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
412 ; CHECK-LABEL: test_vmul_laneq_u32:
413 ; CHECK: // %bb.0: // %entry
414 ; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[3]
417 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
418 %mul = mul <2 x i32> %shuffle, %a
; "u32"-named variant: splats lane 3 of <4 x i32> %v and multiplies the splat by %a.
; The assertions expect one by-element mul on .4s operands using v1.s[3].
422 define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
423 ; CHECK-LABEL: test_vmulq_laneq_u32:
424 ; CHECK: // %bb.0: // %entry
425 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[3]
428 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
429 %mul = mul <4 x i32> %shuffle, %a
; Splats lane 1 of <2 x float> %v and passes it with %b and %a to llvm.fma.v2f32.
; The generic and Exynos-M3 assertions expect a by-element fmla (v2.s[1]); the
; Exynos-M1 assertions expect a dup of the lane followed by a vector fmla.
433 define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
434 ; GENERIC-LABEL: test_vfma_lane_f32:
435 ; GENERIC: // %bb.0: // %entry
436 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
437 ; GENERIC-NEXT: fmla v0.2s, v1.2s, v2.s[1]
440 ; EXYNOSM1-LABEL: test_vfma_lane_f32:
441 ; EXYNOSM1: // %bb.0: // %entry
442 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
443 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[1]
444 ; EXYNOSM1-NEXT: fmla v0.2s, v1.2s, v2.2s
447 ; EXYNOSM3-LABEL: test_vfma_lane_f32:
448 ; EXYNOSM3: // %bb.0: // %entry
449 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
450 ; EXYNOSM3-NEXT: fmla v0.2s, v1.2s, v2.s[1]
453 %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
454 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
458 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
; Splats lane 1 of <2 x float> %v to four elements and passes it with %b and %a to
; llvm.fma.v4f32. The generic and Exynos-M3 assertions expect a by-element fmla
; (v2.s[1]); the Exynos-M1 assertions expect a dup followed by a vector fmla.
460 define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
461 ; GENERIC-LABEL: test_vfmaq_lane_f32:
462 ; GENERIC: // %bb.0: // %entry
463 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
464 ; GENERIC-NEXT: fmla v0.4s, v1.4s, v2.s[1]
467 ; EXYNOSM1-LABEL: test_vfmaq_lane_f32:
468 ; EXYNOSM1: // %bb.0: // %entry
469 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
470 ; EXYNOSM1-NEXT: dup v2.4s, v2.s[1]
471 ; EXYNOSM1-NEXT: fmla v0.4s, v1.4s, v2.4s
474 ; EXYNOSM3-LABEL: test_vfmaq_lane_f32:
475 ; EXYNOSM3: // %bb.0: // %entry
476 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
477 ; EXYNOSM3-NEXT: fmla v0.4s, v1.4s, v2.s[1]
480 %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
481 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
485 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
; Splats lane 3 of <4 x float> %v to two elements and passes it with %b and %a to
; llvm.fma.v2f32. The generic and Exynos-M3 assertions expect a by-element fmla
; (v2.s[3]); the Exynos-M1 assertions expect a dup followed by a vector fmla.
487 define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
488 ; GENERIC-LABEL: test_vfma_laneq_f32:
489 ; GENERIC: // %bb.0: // %entry
490 ; GENERIC-NEXT: fmla v0.2s, v1.2s, v2.s[3]
493 ; EXYNOSM1-LABEL: test_vfma_laneq_f32:
494 ; EXYNOSM1: // %bb.0: // %entry
495 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[3]
496 ; EXYNOSM1-NEXT: fmla v0.2s, v1.2s, v2.2s
499 ; EXYNOSM3-LABEL: test_vfma_laneq_f32:
500 ; EXYNOSM3: // %bb.0: // %entry
501 ; EXYNOSM3-NEXT: fmla v0.2s, v1.2s, v2.s[3]
504 %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
505 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
; Splats lane 3 of <4 x float> %v and passes it with %b and %a to llvm.fma.v4f32.
; The generic and Exynos-M3 assertions expect a by-element fmla (v2.s[3]); the
; Exynos-M1 assertions expect a dup followed by a vector fmla.
509 define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
510 ; GENERIC-LABEL: test_vfmaq_laneq_f32:
511 ; GENERIC: // %bb.0: // %entry
512 ; GENERIC-NEXT: fmla v0.4s, v1.4s, v2.s[3]
515 ; EXYNOSM1-LABEL: test_vfmaq_laneq_f32:
516 ; EXYNOSM1: // %bb.0: // %entry
517 ; EXYNOSM1-NEXT: dup v2.4s, v2.s[3]
518 ; EXYNOSM1-NEXT: fmla v0.4s, v1.4s, v2.4s
521 ; EXYNOSM3-LABEL: test_vfmaq_laneq_f32:
522 ; EXYNOSM3: // %bb.0: // %entry
523 ; EXYNOSM3-NEXT: fmla v0.4s, v1.4s, v2.s[3]
526 %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
527 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
; Negates <2 x float> %v (fsub from -0.0), splats lane 1 of the result and passes it
; with %b and %a to llvm.fma.v2f32. The generic and Exynos-M3 assertions expect a
; by-element fmls (v2.s[1]); the Exynos-M1 assertions expect a dup followed by a vector fmls.
531 define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
532 ; GENERIC-LABEL: test_vfms_lane_f32:
533 ; GENERIC: // %bb.0: // %entry
534 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
535 ; GENERIC-NEXT: fmls v0.2s, v1.2s, v2.s[1]
538 ; EXYNOSM1-LABEL: test_vfms_lane_f32:
539 ; EXYNOSM1: // %bb.0: // %entry
540 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
541 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[1]
542 ; EXYNOSM1-NEXT: fmls v0.2s, v1.2s, v2.2s
545 ; EXYNOSM3-LABEL: test_vfms_lane_f32:
546 ; EXYNOSM3: // %bb.0: // %entry
547 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
548 ; EXYNOSM3-NEXT: fmls v0.2s, v1.2s, v2.s[1]
551 %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
552 %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
553 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
; Negates <2 x float> %v (fsub from -0.0), splats lane 1 of the result to four
; elements and passes it with %b and %a to llvm.fma.v4f32. The generic and Exynos-M3
; assertions expect a by-element fmls (v2.s[1]); Exynos-M1 expects dup + vector fmls.
557 define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
558 ; GENERIC-LABEL: test_vfmsq_lane_f32:
559 ; GENERIC: // %bb.0: // %entry
560 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
561 ; GENERIC-NEXT: fmls v0.4s, v1.4s, v2.s[1]
564 ; EXYNOSM1-LABEL: test_vfmsq_lane_f32:
565 ; EXYNOSM1: // %bb.0: // %entry
566 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
567 ; EXYNOSM1-NEXT: dup v2.4s, v2.s[1]
568 ; EXYNOSM1-NEXT: fmls v0.4s, v1.4s, v2.4s
571 ; EXYNOSM3-LABEL: test_vfmsq_lane_f32:
572 ; EXYNOSM3: // %bb.0: // %entry
573 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
574 ; EXYNOSM3-NEXT: fmls v0.4s, v1.4s, v2.s[1]
577 %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
578 %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
579 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
; Negates <4 x float> %v (fsub from -0.0), splats lane 3 of the result to two
; elements and passes it with %b and %a to llvm.fma.v2f32. The generic and Exynos-M3
; assertions expect a by-element fmls (v2.s[3]); Exynos-M1 expects dup + vector fmls.
583 define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
584 ; GENERIC-LABEL: test_vfms_laneq_f32:
585 ; GENERIC: // %bb.0: // %entry
586 ; GENERIC-NEXT: fmls v0.2s, v1.2s, v2.s[3]
589 ; EXYNOSM1-LABEL: test_vfms_laneq_f32:
590 ; EXYNOSM1: // %bb.0: // %entry
591 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[3]
592 ; EXYNOSM1-NEXT: fmls v0.2s, v1.2s, v2.2s
595 ; EXYNOSM3-LABEL: test_vfms_laneq_f32:
596 ; EXYNOSM3: // %bb.0: // %entry
597 ; EXYNOSM3-NEXT: fmls v0.2s, v1.2s, v2.s[3]
600 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
601 %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
602 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
; Negates <4 x float> %v (fsub from -0.0), splats lane 3 of the result and passes it
; with %b and %a to llvm.fma.v4f32. The generic and Exynos-M3 assertions expect a
; by-element fmls (v2.s[3]); the Exynos-M1 assertions expect dup + vector fmls.
606 define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
607 ; GENERIC-LABEL: test_vfmsq_laneq_f32:
608 ; GENERIC: // %bb.0: // %entry
609 ; GENERIC-NEXT: fmls v0.4s, v1.4s, v2.s[3]
612 ; EXYNOSM1-LABEL: test_vfmsq_laneq_f32:
613 ; EXYNOSM1: // %bb.0: // %entry
614 ; EXYNOSM1-NEXT: dup v2.4s, v2.s[3]
615 ; EXYNOSM1-NEXT: fmls v0.4s, v1.4s, v2.4s
618 ; EXYNOSM3-LABEL: test_vfmsq_laneq_f32:
619 ; EXYNOSM3: // %bb.0: // %entry
620 ; EXYNOSM3-NEXT: fmls v0.4s, v1.4s, v2.s[3]
623 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
624 %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
625 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
; Splats the single element of <1 x double> %v to two elements and passes it with %b
; and %a to llvm.fma.v2f64. The generic and Exynos-M3 assertions expect a by-element
; fmla (v2.d[0]); the Exynos-M1 assertions expect a dup followed by a vector fmla.
629 define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
630 ; GENERIC-LABEL: test_vfmaq_lane_f64:
631 ; GENERIC: // %bb.0: // %entry
632 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
633 ; GENERIC-NEXT: fmla v0.2d, v1.2d, v2.d[0]
636 ; EXYNOSM1-LABEL: test_vfmaq_lane_f64:
637 ; EXYNOSM1: // %bb.0: // %entry
638 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
639 ; EXYNOSM1-NEXT: dup v2.2d, v2.d[0]
640 ; EXYNOSM1-NEXT: fmla v0.2d, v1.2d, v2.2d
643 ; EXYNOSM3-LABEL: test_vfmaq_lane_f64:
644 ; EXYNOSM3: // %bb.0: // %entry
645 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
646 ; EXYNOSM3-NEXT: fmla v0.2d, v1.2d, v2.d[0]
649 %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
650 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
654 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
; Splats lane 1 of <2 x double> %v and passes it with %b and %a to llvm.fma.v2f64.
; The generic and Exynos-M3 assertions expect a by-element fmla (v2.d[1]); the
; Exynos-M1 assertions expect a dup followed by a vector fmla.
656 define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
657 ; GENERIC-LABEL: test_vfmaq_laneq_f64:
658 ; GENERIC: // %bb.0: // %entry
659 ; GENERIC-NEXT: fmla v0.2d, v1.2d, v2.d[1]
662 ; EXYNOSM1-LABEL: test_vfmaq_laneq_f64:
663 ; EXYNOSM1: // %bb.0: // %entry
664 ; EXYNOSM1-NEXT: dup v2.2d, v2.d[1]
665 ; EXYNOSM1-NEXT: fmla v0.2d, v1.2d, v2.2d
668 ; EXYNOSM3-LABEL: test_vfmaq_laneq_f64:
669 ; EXYNOSM3: // %bb.0: // %entry
670 ; EXYNOSM3-NEXT: fmla v0.2d, v1.2d, v2.d[1]
673 %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
674 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
; Negates <1 x double> %v (fsub from -0.0), splats its single element to two elements
; and passes it with %b and %a to llvm.fma.v2f64. The generic and Exynos-M3 assertions
; expect a by-element fmls (v2.d[0]); Exynos-M1 expects dup + vector fmls.
678 define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
679 ; GENERIC-LABEL: test_vfmsq_lane_f64:
680 ; GENERIC: // %bb.0: // %entry
681 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
682 ; GENERIC-NEXT: fmls v0.2d, v1.2d, v2.d[0]
685 ; EXYNOSM1-LABEL: test_vfmsq_lane_f64:
686 ; EXYNOSM1: // %bb.0: // %entry
687 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
688 ; EXYNOSM1-NEXT: dup v2.2d, v2.d[0]
689 ; EXYNOSM1-NEXT: fmls v0.2d, v1.2d, v2.2d
692 ; EXYNOSM3-LABEL: test_vfmsq_lane_f64:
693 ; EXYNOSM3: // %bb.0: // %entry
694 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
695 ; EXYNOSM3-NEXT: fmls v0.2d, v1.2d, v2.d[0]
698 %sub = fsub <1 x double> <double -0.000000e+00>, %v
699 %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
700 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
; Negates <2 x double> %v (fsub from -0.0), splats lane 1 of the result and passes it
; with %b and %a to llvm.fma.v2f64. The generic and Exynos-M3 assertions expect a
; by-element fmls (v2.d[1]); the Exynos-M1 assertions expect dup + vector fmls.
704 define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
705 ; GENERIC-LABEL: test_vfmsq_laneq_f64:
706 ; GENERIC: // %bb.0: // %entry
707 ; GENERIC-NEXT: fmls v0.2d, v1.2d, v2.d[1]
710 ; EXYNOSM1-LABEL: test_vfmsq_laneq_f64:
711 ; EXYNOSM1: // %bb.0: // %entry
712 ; EXYNOSM1-NEXT: dup v2.2d, v2.d[1]
713 ; EXYNOSM1-NEXT: fmls v0.2d, v1.2d, v2.2d
716 ; EXYNOSM3-LABEL: test_vfmsq_laneq_f64:
717 ; EXYNOSM3: // %bb.0: // %entry
718 ; EXYNOSM3-NEXT: fmls v0.2d, v1.2d, v2.d[1]
721 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
722 %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
723 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
; Scalar case: extracts lane 3 of <4 x float> %v and calls llvm.fma.f32(%b, lane, %a).
; The assertions (shared by all RUN lines) expect a scalar by-element fmla using v2.s[3].
727 define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
728 ; CHECK-LABEL: test_vfmas_laneq_f32:
729 ; CHECK: // %bb.0: // %entry
730 ; CHECK-NEXT: fmla s0, s1, v2.s[3]
733 %extract = extractelement <4 x float> %v, i32 3
734 %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
738 declare float @llvm.fma.f32(float, float, float)
; Scalar case: extracts the only element of <1 x double> %v, negates it (fsub from
; -0.0) and calls llvm.fma.f64(%b, negated, %a). The assertions expect a plain
; scalar fmsub rather than a by-element instruction.
740 define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
741 ; CHECK-LABEL: test_vfmsd_lane_f64:
742 ; CHECK: // %bb.0: // %entry
743 ; CHECK-NEXT: fmsub d0, d1, d2, d0
746 %extract.rhs = extractelement <1 x double> %v, i32 0
747 %extract = fsub double -0.000000e+00, %extract.rhs
748 %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
752 declare double @llvm.fma.f64(double, double, double)
; Scalar case: extracts lane 1 of <2 x float> %v, negates it (fsub from -0.0) and
; calls llvm.fma.f32(%b, negated, %a). The assertions expect a scalar by-element
; fmls using v2.s[1].
754 define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
755 ; CHECK-LABEL: test_vfmss_lane_f32:
756 ; CHECK: // %bb.0: // %entry
757 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
758 ; CHECK-NEXT: fmls s0, s1, v2.s[1]
761 %extract.rhs = extractelement <2 x float> %v, i32 1
762 %extract = fsub float -0.000000e+00, %extract.rhs
763 %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
; Scalar case: extracts lane 3 of <4 x float> %v, negates it (fsub from -0.0) and
; calls llvm.fma.f32(%b, negated, %a). The assertions expect a scalar by-element
; fmls using v2.s[3].
767 define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
768 ; CHECK-LABEL: test_vfmss_laneq_f32:
769 ; CHECK: // %bb.0: // %entry
770 ; CHECK-NEXT: fmls s0, s1, v2.s[3]
773 %extract.rhs = extractelement <4 x float> %v, i32 3
774 %extract = fsub float -0.000000e+00, %extract.rhs
775 %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
; Scalar case: extracts lane 1 of <2 x double> %v, negates it (fsub from -0.0) and
; calls llvm.fma.f64(%b, negated, %a). The assertions expect a scalar by-element
; fmls using v2.d[1].
779 define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
780 ; CHECK-LABEL: test_vfmsd_laneq_f64:
781 ; CHECK: // %bb.0: // %entry
782 ; CHECK-NEXT: fmls d0, d1, v2.d[1]
785 %extract.rhs = extractelement <2 x double> %v, i32 1
786 %extract = fsub double -0.000000e+00, %extract.rhs
787 %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
; "_0" variant: negates the whole <1 x double> %v first (vector fsub from -0.0) and
; then extracts element 0 for llvm.fma.f64(%b, elt, %a). The assertions expect the
; same scalar fmsub as when the negation is applied after the extract.
791 define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) {
792 ; CHECK-LABEL: test_vfmsd_lane_f64_0:
793 ; CHECK: // %bb.0: // %entry
794 ; CHECK-NEXT: fmsub d0, d1, d2, d0
797 %tmp0 = fsub <1 x double> <double -0.000000e+00>, %v
798 %tmp1 = extractelement <1 x double> %tmp0, i32 0
799 %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
; "_0" variant: negates the whole <2 x float> %v first (vector fsub from -0.0) and
; then extracts lane 1 for llvm.fma.f32(%b, elt, %a). The assertions expect a scalar
; by-element fmls using v2.s[1].
803 define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
804 ; CHECK-LABEL: test_vfmss_lane_f32_0:
805 ; CHECK: // %bb.0: // %entry
806 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
807 ; CHECK-NEXT: fmls s0, s1, v2.s[1]
810 %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
811 %tmp1 = extractelement <2 x float> %tmp0, i32 1
812 %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
; "_0" variant: negates the whole <4 x float> %v first (vector fsub from -0.0) and
; then extracts lane 3 for llvm.fma.f32(%b, elt, %a). The assertions expect a scalar
; by-element fmls using v2.s[3].
816 define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
817 ; CHECK-LABEL: test_vfmss_laneq_f32_0:
818 ; CHECK: // %bb.0: // %entry
819 ; CHECK-NEXT: fmls s0, s1, v2.s[3]
822 %tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
823 %tmp1 = extractelement <4 x float> %tmp0, i32 3
824 %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
; "_0" variant: negates the whole <2 x double> %v first (vector fsub from -0.0) and
; then extracts lane 1 for llvm.fma.f64(%b, elt, %a). The assertions expect a scalar
; by-element fmls using v2.d[1].
828 define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
829 ; CHECK-LABEL: test_vfmsd_laneq_f64_0:
830 ; CHECK: // %bb.0: // %entry
831 ; CHECK-NEXT: fmls d0, d1, v2.d[1]
834 %tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v
835 %tmp1 = extractelement <2 x double> %tmp0, i32 1
836 %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
; Splats lane 3 of <4 x i16> %v, multiplies it with %b through the smull.v4i32
; intrinsic and adds %a. The assertions expect one by-element smlal using v2.h[3].
840 define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
841 ; CHECK-LABEL: test_vmlal_lane_s16:
842 ; CHECK: // %bb.0: // %entry
843 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
844 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[3]
847 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
848 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
849 %add = add <4 x i32> %vmull2.i, %a
; Splats lane 1 of <2 x i32> %v, multiplies it with %b through the smull.v2i64
; intrinsic and adds %a. The assertions expect one by-element smlal using v2.s[1].
853 define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
854 ; CHECK-LABEL: test_vmlal_lane_s32:
855 ; CHECK: // %bb.0: // %entry
856 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
857 ; CHECK-NEXT: smlal v0.2d, v1.2s, v2.s[1]
860 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
861 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
862 %add = add <2 x i64> %vmull2.i, %a
; Splats lane 7 of <8 x i16> %v to four elements, multiplies it with %b through the
; smull.v4i32 intrinsic and adds %a. The assertions expect one by-element smlal using v2.h[7].
866 define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
867 ; CHECK-LABEL: test_vmlal_laneq_s16:
868 ; CHECK: // %bb.0: // %entry
869 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[7]
872 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
873 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
874 %add = add <4 x i32> %vmull2.i, %a
; Splats lane 3 of <4 x i32> %v to two elements, multiplies it with %b through the
; smull.v2i64 intrinsic and adds %a. The assertions expect one by-element smlal using v2.s[3].
878 define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
879 ; CHECK-LABEL: test_vmlal_laneq_s32:
880 ; CHECK: // %bb.0: // %entry
881 ; CHECK-NEXT: smlal v0.2d, v1.2s, v2.s[3]
884 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
885 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
886 %add = add <2 x i64> %vmull2.i, %a
; Takes the upper four elements of <8 x i16> %b (indices 4-7), multiplies them with a
; splat of lane 3 of <4 x i16> %v through smull.v4i32 and adds %a. The assertions
; expect one by-element smlal2 using v2.h[3].
890 define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
891 ; CHECK-LABEL: test_vmlal_high_lane_s16:
892 ; CHECK: // %bb.0: // %entry
893 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
894 ; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.h[3]
897 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
898 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
899 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
900 %add = add <4 x i32> %vmull2.i, %a
; Takes the upper two elements of <4 x i32> %b (indices 2-3), multiplies them with a
; splat of lane 1 of <2 x i32> %v through smull.v2i64 and adds %a. The assertions
; expect one by-element smlal2 using v2.s[1].
904 define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
905 ; CHECK-LABEL: test_vmlal_high_lane_s32:
906 ; CHECK: // %bb.0: // %entry
907 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
908 ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.s[1]
911 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
912 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
913 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
914 %add = add <2 x i64> %vmull2.i, %a
; Takes the upper four elements of <8 x i16> %b (indices 4-7), multiplies them with a
; splat of lane 7 of <8 x i16> %v through smull.v4i32 and adds %a. The assertions
; expect one by-element smlal2 using v2.h[7].
918 define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
919 ; CHECK-LABEL: test_vmlal_high_laneq_s16:
920 ; CHECK: // %bb.0: // %entry
921 ; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.h[7]
924 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
925 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
926 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
927 %add = add <4 x i32> %vmull2.i, %a
; Takes the upper two elements of <4 x i32> %b (indices 2-3), multiplies them with a
; splat of lane 3 of <4 x i32> %v through smull.v2i64 and adds %a. The assertions
; expect one by-element smlal2 using v2.s[3].
931 define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
932 ; CHECK-LABEL: test_vmlal_high_laneq_s32:
933 ; CHECK: // %bb.0: // %entry
934 ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.s[3]
937 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
938 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
939 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
940 %add = add <2 x i64> %vmull2.i, %a
; Signed widening multiply-subtract: %b times lane 3 of %v (smull intrinsic),
; subtracted from %a. Expected code is by-element smlsl on v2.h[3].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
944 define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
945 ; CHECK-LABEL: test_vmlsl_lane_s16:
946 ; CHECK: // %bb.0: // %entry
947 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
948 ; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.h[3]
; Splat element 3 of %v.
951 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
952 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
953 %sub = sub <4 x i32> %a, %vmull2.i
; Signed widening multiply-subtract: %b times lane 1 of %v (smull intrinsic),
; subtracted from %a. Expected code is by-element smlsl on v2.s[1].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
957 define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
958 ; CHECK-LABEL: test_vmlsl_lane_s32:
959 ; CHECK: // %bb.0: // %entry
960 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
961 ; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.s[1]
; Splat element 1 of %v.
964 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
965 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
966 %sub = sub <2 x i64> %a, %vmull2.i
; Signed widening multiply-subtract: %b times lane 7 of the 128-bit %v (smull
; intrinsic), subtracted from %a. Expected code is by-element smlsl on v2.h[7].
970 define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
971 ; CHECK-LABEL: test_vmlsl_laneq_s16:
972 ; CHECK: // %bb.0: // %entry
973 ; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.h[7]
; Splat element 7 of %v into a 4-element vector.
976 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
977 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
978 %sub = sub <4 x i32> %a, %vmull2.i
; Signed widening multiply-subtract: %b times lane 3 of the 128-bit %v (smull
; intrinsic), subtracted from %a. Expected code is by-element smlsl on v2.s[3].
982 define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
983 ; CHECK-LABEL: test_vmlsl_laneq_s32:
984 ; CHECK: // %bb.0: // %entry
985 ; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.s[3]
; Splat element 3 of %v into a 2-element vector.
988 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
989 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
990 %sub = sub <2 x i64> %a, %vmull2.i
; Signed widening multiply-subtract: elements 4-7 of %b times lane 3 of %v
; (smull intrinsic), subtracted from %a. Expected code is by-element smlsl2 on
; v2.h[3]. %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
994 define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
995 ; CHECK-LABEL: test_vmlsl_high_lane_s16:
996 ; CHECK: // %bb.0: // %entry
997 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
998 ; CHECK-NEXT: smlsl2 v0.4s, v1.8h, v2.h[3]
; Extract the upper four elements of %b.
1001 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 3 of %v.
1002 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1003 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1004 %sub = sub <4 x i32> %a, %vmull2.i
; Signed widening multiply-subtract: elements 2-3 of %b times lane 1 of %v
; (smull intrinsic), subtracted from %a. Expected code is by-element smlsl2 on
; v2.s[1]. %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1008 define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1009 ; CHECK-LABEL: test_vmlsl_high_lane_s32:
1010 ; CHECK: // %bb.0: // %entry
1011 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1012 ; CHECK-NEXT: smlsl2 v0.2d, v1.4s, v2.s[1]
; Extract the upper two elements of %b.
1015 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 1 of %v.
1016 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1017 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1018 %sub = sub <2 x i64> %a, %vmull2.i
; Signed widening multiply-subtract: elements 4-7 of %b times lane 7 of the
; 128-bit %v (smull intrinsic), subtracted from %a. Expected code is by-element
; smlsl2 on v2.h[7]; no kill annotation appears in the expected output.
1022 define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
1023 ; CHECK-LABEL: test_vmlsl_high_laneq_s16:
1024 ; CHECK: // %bb.0: // %entry
1025 ; CHECK-NEXT: smlsl2 v0.4s, v1.8h, v2.h[7]
; Extract the upper four elements of %b.
1028 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 7 of %v.
1029 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1030 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1031 %sub = sub <4 x i32> %a, %vmull2.i
; Signed widening multiply-subtract: elements 2-3 of %b times lane 3 of the
; 128-bit %v (smull intrinsic), subtracted from %a. Expected code is by-element
; smlsl2 on v2.s[3]; no kill annotation appears in the expected output.
1035 define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
1036 ; CHECK-LABEL: test_vmlsl_high_laneq_s32:
1037 ; CHECK: // %bb.0: // %entry
1038 ; CHECK-NEXT: smlsl2 v0.2d, v1.4s, v2.s[3]
; Extract the upper two elements of %b.
1041 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 3 of %v.
1042 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1043 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1044 %sub = sub <2 x i64> %a, %vmull2.i
; Unsigned widening multiply-accumulate: %b times lane 3 of %v (umull
; intrinsic), added to %a. Expected code is by-element umlal on v2.h[3].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1048 define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1049 ; CHECK-LABEL: test_vmlal_lane_u16:
1050 ; CHECK: // %bb.0: // %entry
1051 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1052 ; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[3]
; Splat element 3 of %v.
1055 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1056 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1057 %add = add <4 x i32> %vmull2.i, %a
; Unsigned widening multiply-accumulate: %b times lane 1 of %v (umull
; intrinsic), added to %a. Expected code is by-element umlal on v2.s[1].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1061 define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1062 ; CHECK-LABEL: test_vmlal_lane_u32:
1063 ; CHECK: // %bb.0: // %entry
1064 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1065 ; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[1]
; Splat element 1 of %v.
1068 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1069 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1070 %add = add <2 x i64> %vmull2.i, %a
; Unsigned widening multiply-accumulate: %b times lane 7 of the 128-bit %v
; (umull intrinsic), added to %a. Expected code is by-element umlal on v2.h[7].
1074 define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
1075 ; CHECK-LABEL: test_vmlal_laneq_u16:
1076 ; CHECK: // %bb.0: // %entry
1077 ; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[7]
; Splat element 7 of %v into a 4-element vector.
1080 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1081 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1082 %add = add <4 x i32> %vmull2.i, %a
; Unsigned widening multiply-accumulate: %b times lane 3 of the 128-bit %v
; (umull intrinsic), added to %a. Expected code is by-element umlal on v2.s[3].
1086 define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
1087 ; CHECK-LABEL: test_vmlal_laneq_u32:
1088 ; CHECK: // %bb.0: // %entry
1089 ; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[3]
; Splat element 3 of %v into a 2-element vector.
1092 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1093 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1094 %add = add <2 x i64> %vmull2.i, %a
; Unsigned widening multiply-accumulate: elements 4-7 of %b times lane 3 of %v
; (umull intrinsic), added to %a. Expected code is by-element umlal2 on v2.h[3].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1098 define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1099 ; CHECK-LABEL: test_vmlal_high_lane_u16:
1100 ; CHECK: // %bb.0: // %entry
1101 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1102 ; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.h[3]
; Extract the upper four elements of %b.
1105 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 3 of %v.
1106 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1107 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1108 %add = add <4 x i32> %vmull2.i, %a
; Unsigned widening multiply-accumulate: elements 2-3 of %b times lane 1 of %v
; (umull intrinsic), added to %a. Expected code is by-element umlal2 on v2.s[1].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1112 define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1113 ; CHECK-LABEL: test_vmlal_high_lane_u32:
1114 ; CHECK: // %bb.0: // %entry
1115 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1116 ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.s[1]
; Extract the upper two elements of %b.
1119 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 1 of %v.
1120 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1121 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1122 %add = add <2 x i64> %vmull2.i, %a
; Unsigned widening multiply-accumulate: elements 4-7 of %b times lane 7 of the
; 128-bit %v (umull intrinsic), added to %a. Expected code is by-element umlal2
; on v2.h[7]; no kill annotation appears in the expected output.
1126 define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
1127 ; CHECK-LABEL: test_vmlal_high_laneq_u16:
1128 ; CHECK: // %bb.0: // %entry
1129 ; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.h[7]
; Extract the upper four elements of %b.
1132 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 7 of %v.
1133 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1134 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1135 %add = add <4 x i32> %vmull2.i, %a
; Unsigned widening multiply-accumulate: elements 2-3 of %b times lane 3 of the
; 128-bit %v (umull intrinsic), added to %a. Expected code is by-element umlal2
; on v2.s[3]; no kill annotation appears in the expected output.
1139 define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
1140 ; CHECK-LABEL: test_vmlal_high_laneq_u32:
1141 ; CHECK: // %bb.0: // %entry
1142 ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.s[3]
; Extract the upper two elements of %b.
1145 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 3 of %v.
1146 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1147 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1148 %add = add <2 x i64> %vmull2.i, %a
; Unsigned widening multiply-subtract: %b times lane 3 of %v (umull intrinsic),
; subtracted from %a. Expected code is by-element umlsl on v2.h[3].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1152 define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1153 ; CHECK-LABEL: test_vmlsl_lane_u16:
1154 ; CHECK: // %bb.0: // %entry
1155 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1156 ; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.h[3]
; Splat element 3 of %v.
1159 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1160 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1161 %sub = sub <4 x i32> %a, %vmull2.i
; Unsigned widening multiply-subtract: %b times lane 1 of %v (umull intrinsic),
; subtracted from %a. Expected code is by-element umlsl on v2.s[1].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1165 define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1166 ; CHECK-LABEL: test_vmlsl_lane_u32:
1167 ; CHECK: // %bb.0: // %entry
1168 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1169 ; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.s[1]
; Splat element 1 of %v.
1172 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1173 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1174 %sub = sub <2 x i64> %a, %vmull2.i
; Unsigned widening multiply-subtract: %b times lane 7 of the 128-bit %v (umull
; intrinsic), subtracted from %a. Expected code is by-element umlsl on v2.h[7].
1178 define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
1179 ; CHECK-LABEL: test_vmlsl_laneq_u16:
1180 ; CHECK: // %bb.0: // %entry
1181 ; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.h[7]
; Splat element 7 of %v into a 4-element vector.
1184 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1185 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1186 %sub = sub <4 x i32> %a, %vmull2.i
; Unsigned widening multiply-subtract: %b times lane 3 of the 128-bit %v (umull
; intrinsic), subtracted from %a. Expected code is by-element umlsl on v2.s[3].
1190 define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
1191 ; CHECK-LABEL: test_vmlsl_laneq_u32:
1192 ; CHECK: // %bb.0: // %entry
1193 ; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.s[3]
; Splat element 3 of %v into a 2-element vector.
1196 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1197 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1198 %sub = sub <2 x i64> %a, %vmull2.i
; Unsigned widening multiply-subtract: elements 4-7 of %b times lane 3 of %v
; (umull intrinsic), subtracted from %a. Expected code is by-element umlsl2 on
; v2.h[3]. %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1202 define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1203 ; CHECK-LABEL: test_vmlsl_high_lane_u16:
1204 ; CHECK: // %bb.0: // %entry
1205 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1206 ; CHECK-NEXT: umlsl2 v0.4s, v1.8h, v2.h[3]
; Extract the upper four elements of %b.
1209 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 3 of %v.
1210 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1211 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1212 %sub = sub <4 x i32> %a, %vmull2.i
; Unsigned widening multiply-subtract: elements 2-3 of %b times lane 1 of %v
; (umull intrinsic), subtracted from %a. Expected code is by-element umlsl2 on
; v2.s[1]. %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1216 define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1217 ; CHECK-LABEL: test_vmlsl_high_lane_u32:
1218 ; CHECK: // %bb.0: // %entry
1219 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1220 ; CHECK-NEXT: umlsl2 v0.2d, v1.4s, v2.s[1]
; Extract the upper two elements of %b.
1223 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 1 of %v.
1224 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1225 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1226 %sub = sub <2 x i64> %a, %vmull2.i
; Unsigned widening multiply-subtract: elements 4-7 of %b times lane 7 of the
; 128-bit %v (umull intrinsic), subtracted from %a. Expected code is by-element
; umlsl2 on v2.h[7]; no kill annotation appears in the expected output.
1230 define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
1231 ; CHECK-LABEL: test_vmlsl_high_laneq_u16:
1232 ; CHECK: // %bb.0: // %entry
1233 ; CHECK-NEXT: umlsl2 v0.4s, v1.8h, v2.h[7]
; Extract the upper four elements of %b.
1236 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 7 of %v.
1237 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1238 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1239 %sub = sub <4 x i32> %a, %vmull2.i
; Unsigned widening multiply-subtract: elements 2-3 of %b times lane 3 of the
; 128-bit %v (umull intrinsic), subtracted from %a. Expected code is by-element
; umlsl2 on v2.s[3]; no kill annotation appears in the expected output.
1243 define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
1244 ; CHECK-LABEL: test_vmlsl_high_laneq_u32:
1245 ; CHECK: // %bb.0: // %entry
1246 ; CHECK-NEXT: umlsl2 v0.2d, v1.4s, v2.s[3]
; Extract the upper two elements of %b.
1249 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 3 of %v.
1250 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1251 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1252 %sub = sub <2 x i64> %a, %vmull2.i
; Signed widening multiply of %a by lane 3 of %v (smull intrinsic); the product
; is returned directly. Expected code is by-element smull on v1.h[3].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1256 define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1257 ; CHECK-LABEL: test_vmull_lane_s16:
1258 ; CHECK: // %bb.0: // %entry
1259 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1260 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[3]
; Splat element 3 of %v.
1263 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1264 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1265 ret <4 x i32> %vmull2.i
; Signed widening multiply of %a by lane 1 of %v (smull intrinsic); the product
; is returned directly. Expected code is by-element smull on v1.s[1].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1268 define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1269 ; CHECK-LABEL: test_vmull_lane_s32:
1270 ; CHECK: // %bb.0: // %entry
1271 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1272 ; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[1]
; Splat element 1 of %v.
1275 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1276 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1277 ret <2 x i64> %vmull2.i
; Unsigned widening multiply of %a by lane 3 of %v (umull intrinsic); the
; product is returned directly. Expected code is by-element umull on v1.h[3].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1280 define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
1281 ; CHECK-LABEL: test_vmull_lane_u16:
1282 ; CHECK: // %bb.0: // %entry
1283 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1284 ; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[3]
; Splat element 3 of %v.
1287 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1288 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1289 ret <4 x i32> %vmull2.i
; Unsigned widening multiply of %a by lane 1 of %v (umull intrinsic); the
; product is returned directly. Expected code is by-element umull on v1.s[1].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1292 define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
1293 ; CHECK-LABEL: test_vmull_lane_u32:
1294 ; CHECK: // %bb.0: // %entry
1295 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1296 ; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[1]
; Splat element 1 of %v.
1299 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1300 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1301 ret <2 x i64> %vmull2.i
; Signed widening multiply of elements 4-7 of %a by lane 3 of %v (smull
; intrinsic); the product is returned. Expected code is by-element smull2 on
; v1.h[3]. %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1304 define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1305 ; CHECK-LABEL: test_vmull_high_lane_s16:
1306 ; CHECK: // %bb.0: // %entry
1307 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1308 ; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.h[3]
; Extract the upper four elements of %a.
1311 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 3 of %v.
1312 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1313 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1314 ret <4 x i32> %vmull2.i
; Signed widening multiply of elements 2-3 of %a by lane 1 of %v (smull
; intrinsic); the product is returned. Expected code is by-element smull2 on
; v1.s[1]. %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1317 define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1318 ; CHECK-LABEL: test_vmull_high_lane_s32:
1319 ; CHECK: // %bb.0: // %entry
1320 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1321 ; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.s[1]
; Extract the upper two elements of %a.
1324 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 1 of %v.
1325 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1326 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1327 ret <2 x i64> %vmull2.i
; Unsigned widening multiply of elements 4-7 of %a by lane 3 of %v (umull
; intrinsic); the product is returned. Expected code is by-element umull2 on
; v1.h[3]. %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1330 define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
1331 ; CHECK-LABEL: test_vmull_high_lane_u16:
1332 ; CHECK: // %bb.0: // %entry
1333 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1334 ; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.h[3]
; Extract the upper four elements of %a.
1337 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 3 of %v.
1338 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1339 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1340 ret <4 x i32> %vmull2.i
; Unsigned widening multiply of elements 2-3 of %a by lane 1 of %v (umull
; intrinsic); the product is returned. Expected code is by-element umull2 on
; v1.s[1]. %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1343 define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
1344 ; CHECK-LABEL: test_vmull_high_lane_u32:
1345 ; CHECK: // %bb.0: // %entry
1346 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1347 ; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.s[1]
; Extract the upper two elements of %a.
1350 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 1 of %v.
1351 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1352 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1353 ret <2 x i64> %vmull2.i
; Signed widening multiply of %a by lane 7 of the 128-bit %v (smull intrinsic);
; the product is returned. Expected code is by-element smull on v1.h[7].
1356 define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
1357 ; CHECK-LABEL: test_vmull_laneq_s16:
1358 ; CHECK: // %bb.0: // %entry
1359 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[7]
; Splat element 7 of %v into a 4-element vector.
1362 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1363 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1364 ret <4 x i32> %vmull2.i
; Signed widening multiply of %a by lane 3 of the 128-bit %v (smull intrinsic);
; the product is returned. Expected code is by-element smull on v1.s[3].
1367 define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
1368 ; CHECK-LABEL: test_vmull_laneq_s32:
1369 ; CHECK: // %bb.0: // %entry
1370 ; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[3]
; Splat element 3 of %v into a 2-element vector.
1373 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1374 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1375 ret <2 x i64> %vmull2.i
; Unsigned widening multiply of %a by lane 7 of the 128-bit %v (umull
; intrinsic); the product is returned. Expected code is by-element umull on
; v1.h[7].
1378 define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
1379 ; CHECK-LABEL: test_vmull_laneq_u16:
1380 ; CHECK: // %bb.0: // %entry
1381 ; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[7]
; Splat element 7 of %v into a 4-element vector.
1384 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1385 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1386 ret <4 x i32> %vmull2.i
; Unsigned widening multiply of %a by lane 3 of the 128-bit %v (umull
; intrinsic); the product is returned. Expected code is by-element umull on
; v1.s[3].
1389 define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
1390 ; CHECK-LABEL: test_vmull_laneq_u32:
1391 ; CHECK: // %bb.0: // %entry
1392 ; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[3]
; Splat element 3 of %v into a 2-element vector.
1395 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1396 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1397 ret <2 x i64> %vmull2.i
; Signed widening multiply of elements 4-7 of %a by lane 7 of the 128-bit %v
; (smull intrinsic); the product is returned. Expected code is by-element
; smull2 on v1.h[7].
1400 define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
1401 ; CHECK-LABEL: test_vmull_high_laneq_s16:
1402 ; CHECK: // %bb.0: // %entry
1403 ; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.h[7]
; Extract the upper four elements of %a.
1406 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 7 of %v.
1407 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1408 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1409 ret <4 x i32> %vmull2.i
; Signed widening multiply of elements 2-3 of %a by lane 3 of the 128-bit %v
; (smull intrinsic); the product is returned. Expected code is by-element
; smull2 on v1.s[3].
1412 define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
1413 ; CHECK-LABEL: test_vmull_high_laneq_s32:
1414 ; CHECK: // %bb.0: // %entry
1415 ; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.s[3]
; Extract the upper two elements of %a.
1418 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 3 of %v.
1419 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1420 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1421 ret <2 x i64> %vmull2.i
; Unsigned widening multiply of elements 4-7 of %a by lane 7 of the 128-bit %v
; (umull intrinsic); the product is returned. Expected code is by-element
; umull2 on v1.h[7].
1424 define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
1425 ; CHECK-LABEL: test_vmull_high_laneq_u16:
1426 ; CHECK: // %bb.0: // %entry
1427 ; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.h[7]
; Extract the upper four elements of %a.
1430 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 7 of %v.
1431 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1432 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1433 ret <4 x i32> %vmull2.i
; Unsigned widening multiply of elements 2-3 of %a by lane 3 of the 128-bit %v
; (umull intrinsic); the product is returned. Expected code is by-element
; umull2 on v1.s[3].
1436 define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
1437 ; CHECK-LABEL: test_vmull_high_laneq_u32:
1438 ; CHECK: // %bb.0: // %entry
1439 ; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.s[3]
; Extract the upper two elements of %a.
1442 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 3 of %v.
1443 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1444 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1445 ret <2 x i64> %vmull2.i
; sqdmull intrinsic of %b by lane 3 of %v, then sqadd intrinsic with %a; the
; pair is expected to become one by-element sqdmlal on v2.h[3].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1448 define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1449 ; CHECK-LABEL: test_vqdmlal_lane_s16:
1450 ; CHECK: // %bb.0: // %entry
1451 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1452 ; CHECK-NEXT: sqdmlal v0.4s, v1.4h, v2.h[3]
; Splat element 3 of %v.
1455 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1456 %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
; Accumulate the product into %a through the sqadd intrinsic.
1457 %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
1458 ret <4 x i32> %vqdmlal4.i
; sqdmull intrinsic of %b by lane 1 of %v, then sqadd intrinsic with %a; the
; pair is expected to become one by-element sqdmlal on v2.s[1].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1461 define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1462 ; CHECK-LABEL: test_vqdmlal_lane_s32:
1463 ; CHECK: // %bb.0: // %entry
1464 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1465 ; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.s[1]
; Splat element 1 of %v.
1468 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1469 %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
; Accumulate the product into %a through the sqadd intrinsic.
1470 %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
1471 ret <2 x i64> %vqdmlal4.i
; sqdmull intrinsic of elements 4-7 of %b by lane 3 of %v, then sqadd intrinsic
; with %a; expected to become one by-element sqdmlal2 on v2.h[3].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1474 define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1475 ; CHECK-LABEL: test_vqdmlal_high_lane_s16:
1476 ; CHECK: // %bb.0: // %entry
1477 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1478 ; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.h[3]
; Extract the upper four elements of %b.
1481 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 3 of %v.
1482 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1483 %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
; Accumulate the product into %a through the sqadd intrinsic.
1484 %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
1485 ret <4 x i32> %vqdmlal4.i
; sqdmull intrinsic of elements 2-3 of %b by lane 1 of %v, then sqadd intrinsic
; with %a; expected to become one by-element sqdmlal2 on v2.s[1].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1488 define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1489 ; CHECK-LABEL: test_vqdmlal_high_lane_s32:
1490 ; CHECK: // %bb.0: // %entry
1491 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1492 ; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.s[1]
; Extract the upper two elements of %b.
1495 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 1 of %v.
1496 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1497 %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
; Accumulate the product into %a through the sqadd intrinsic.
1498 %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
1499 ret <2 x i64> %vqdmlal4.i
; sqdmull intrinsic of %b by lane 3 of %v, then sqsub intrinsic from %a; the
; pair is expected to become one by-element sqdmlsl on v2.h[3].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1502 define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1503 ; CHECK-LABEL: test_vqdmlsl_lane_s16:
1504 ; CHECK: // %bb.0: // %entry
1505 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1506 ; CHECK-NEXT: sqdmlsl v0.4s, v1.4h, v2.h[3]
; Splat element 3 of %v.
1509 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1510 %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
; Subtract the product from %a through the sqsub intrinsic.
1511 %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
1512 ret <4 x i32> %vqdmlsl4.i
; sqdmull intrinsic of %b by lane 1 of %v, then sqsub intrinsic from %a; the
; pair is expected to become one by-element sqdmlsl on v2.s[1].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1515 define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1516 ; CHECK-LABEL: test_vqdmlsl_lane_s32:
1517 ; CHECK: // %bb.0: // %entry
1518 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1519 ; CHECK-NEXT: sqdmlsl v0.2d, v1.2s, v2.s[1]
; Splat element 1 of %v.
1522 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1523 %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
; Subtract the product from %a through the sqsub intrinsic.
1524 %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
1525 ret <2 x i64> %vqdmlsl4.i
; sqdmull intrinsic of elements 4-7 of %b by lane 3 of %v, then sqsub intrinsic
; from %a; expected to become one by-element sqdmlsl2 on v2.h[3].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1528 define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1529 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
1530 ; CHECK: // %bb.0: // %entry
1531 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1532 ; CHECK-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.h[3]
; Extract the upper four elements of %b.
1535 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 3 of %v.
1536 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1537 %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
; Subtract the product from %a through the sqsub intrinsic.
1538 %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
1539 ret <4 x i32> %vqdmlsl4.i
; sqdmull intrinsic of elements 2-3 of %b by lane 1 of %v, then sqsub intrinsic
; from %a; expected to become one by-element sqdmlsl2 on v2.s[1].
; %v is a 64-bit vector, hence the expected $d2 -> $q2 kill annotation.
1542 define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1543 ; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
1544 ; CHECK: // %bb.0: // %entry
1545 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
1546 ; CHECK-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.s[1]
; Extract the upper two elements of %b.
1549 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 1 of %v.
1550 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1551 %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
; Subtract the product from %a through the sqsub intrinsic.
1552 %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
1553 ret <2 x i64> %vqdmlsl4.i
; sqdmull intrinsic of %a by lane 3 of %v; the product is returned directly.
; Expected code is by-element sqdmull on v1.h[3].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1556 define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1557 ; CHECK-LABEL: test_vqdmull_lane_s16:
1558 ; CHECK: // %bb.0: // %entry
1559 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1560 ; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[3]
; Splat element 3 of %v.
1563 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1564 %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1565 ret <4 x i32> %vqdmull2.i
; sqdmull intrinsic of %a by lane 1 of %v; the product is returned directly.
; Expected code is by-element sqdmull on v1.s[1].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1568 define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1569 ; CHECK-LABEL: test_vqdmull_lane_s32:
1570 ; CHECK: // %bb.0: // %entry
1571 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1572 ; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[1]
; Splat element 1 of %v.
1575 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1576 %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1577 ret <2 x i64> %vqdmull2.i
; sqdmull intrinsic of %a by lane 3 of the 128-bit %v; the product is returned.
; Expected code is by-element sqdmull on v1.h[3].
; NOTE(review): the other 16-bit laneq tests in this part of the file index
; lane 7, which only a 128-bit source can supply; this one indexes lane 3.
; Confirm that is intentional before changing it (the expected output would
; have to be regenerated).
1580 define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
1581 ; CHECK-LABEL: test_vqdmull_laneq_s16:
1582 ; CHECK: // %bb.0: // %entry
1583 ; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[3]
; Splat element 3 of %v into a 4-element vector.
1586 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1587 %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1588 ret <4 x i32> %vqdmull2.i
; sqdmull intrinsic of %a by lane 3 of the 128-bit %v; the product is returned.
; Expected code is by-element sqdmull on v1.s[3].
1591 define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
1592 ; CHECK-LABEL: test_vqdmull_laneq_s32:
1593 ; CHECK: // %bb.0: // %entry
1594 ; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[3]
; Splat element 3 of %v into a 2-element vector.
1597 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1598 %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1599 ret <2 x i64> %vqdmull2.i
; sqdmull intrinsic of elements 4-7 of %a by lane 3 of %v; the product is
; returned. Expected code is by-element sqdmull2 on v1.h[3].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1602 define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1603 ; CHECK-LABEL: test_vqdmull_high_lane_s16:
1604 ; CHECK: // %bb.0: // %entry
1605 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1606 ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v1.h[3]
; Extract the upper four elements of %a.
1609 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 3 of %v.
1610 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1611 %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1612 ret <4 x i32> %vqdmull2.i
; sqdmull intrinsic of elements 2-3 of %a by lane 1 of %v; the product is
; returned. Expected code is by-element sqdmull2 on v1.s[1].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1615 define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1616 ; CHECK-LABEL: test_vqdmull_high_lane_s32:
1617 ; CHECK: // %bb.0: // %entry
1618 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1619 ; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.s[1]
; Extract the upper two elements of %a.
1622 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 1 of %v.
1623 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1624 %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1625 ret <2 x i64> %vqdmull2.i
; sqdmull intrinsic of elements 4-7 of %a by lane 7 of the 128-bit %v; the
; product is returned. Expected code is by-element sqdmull2 on v1.h[7].
1628 define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
1629 ; CHECK-LABEL: test_vqdmull_high_laneq_s16:
1630 ; CHECK: // %bb.0: // %entry
1631 ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v1.h[7]
; Extract the upper four elements of %a.
1634 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Splat element 7 of %v.
1635 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1636 %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1637 ret <4 x i32> %vqdmull2.i
; sqdmull intrinsic of elements 2-3 of %a by lane 3 of the 128-bit %v; the
; product is returned. Expected code is by-element sqdmull2 on v1.s[3].
1640 define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
1641 ; CHECK-LABEL: test_vqdmull_high_laneq_s32:
1642 ; CHECK: // %bb.0: // %entry
1643 ; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.s[3]
; Extract the upper two elements of %a.
1646 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; Splat element 3 of %v.
1647 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1648 %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1649 ret <2 x i64> %vqdmull2.i
; sqdmulh intrinsic of %a by lane 3 of %v; result has the same element width as
; the inputs and is returned. Expected code is by-element sqdmulh on v1.h[3].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1652 define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1653 ; CHECK-LABEL: test_vqdmulh_lane_s16:
1654 ; CHECK: // %bb.0: // %entry
1655 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1656 ; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
; Splat element 3 of %v.
1659 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1660 %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
1661 ret <4 x i16> %vqdmulh2.i
; sqdmulh intrinsic of the 128-bit %a by lane 3 of the 64-bit %v; the result is
; returned. Expected code is by-element sqdmulh (8h form) on v1.h[3].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1664 define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1665 ; CHECK-LABEL: test_vqdmulhq_lane_s16:
1666 ; CHECK: // %bb.0: // %entry
1667 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1668 ; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
; Splat element 3 of the 4-element %v across all eight result lanes.
1671 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1672 %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
1673 ret <8 x i16> %vqdmulh2.i
; sqdmulh intrinsic of %a by lane 1 of %v; result has the same element width as
; the inputs and is returned. Expected code is by-element sqdmulh on v1.s[1].
; %v is a 64-bit vector, hence the expected $d1 -> $q1 kill annotation.
1676 define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1677 ; CHECK-LABEL: test_vqdmulh_lane_s32:
1678 ; CHECK: // %bb.0: // %entry
1679 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1680 ; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
; Splat element 1 of %v.
1683 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1684 %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
1685 ret <2 x i32> %vqdmulh2.i
; sqdmulh intrinsic on <4 x i32> %a and element 1 of the 64-bit vector %v
; splatted to four lanes. Expected output: by-element sqdmulh reading v1.s[1].
1688 define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1689 ; CHECK-LABEL: test_vqdmulhq_lane_s32:
1690 ; CHECK: // %bb.0: // %entry
1691 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1692 ; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
1695 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1696 %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
1697 ret <4 x i32> %vqdmulh2.i
; sqrdmulh intrinsic on <4 x i16> %a and a splat of element 3 of the 64-bit
; vector %v. Expected output: by-element sqrdmulh reading v1.h[3].
1700 define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1701 ; CHECK-LABEL: test_vqrdmulh_lane_s16:
1702 ; CHECK: // %bb.0: // %entry
1703 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1704 ; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
1707 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1708 %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
1709 ret <4 x i16> %vqrdmulh2.i
; sqrdmulh intrinsic on <8 x i16> %a and element 3 of the 64-bit vector %v
; splatted to eight lanes. Expected output: by-element sqrdmulh reading v1.h[3].
1712 define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1713 ; CHECK-LABEL: test_vqrdmulhq_lane_s16:
1714 ; CHECK: // %bb.0: // %entry
1715 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1716 ; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
1719 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1720 %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
1721 ret <8 x i16> %vqrdmulh2.i
; sqrdmulh intrinsic on <2 x i32> %a and a splat of element 1 of the 64-bit
; vector %v. Expected output: by-element sqrdmulh reading v1.s[1].
1724 define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1725 ; CHECK-LABEL: test_vqrdmulh_lane_s32:
1726 ; CHECK: // %bb.0: // %entry
1727 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1728 ; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
1731 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1732 %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
1733 ret <2 x i32> %vqrdmulh2.i
; sqrdmulh intrinsic on <4 x i32> %a and element 1 of the 64-bit vector %v
; splatted to four lanes. Expected output: by-element sqrdmulh reading v1.s[1].
1736 define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1737 ; CHECK-LABEL: test_vqrdmulhq_lane_s32:
1738 ; CHECK: // %bb.0: // %entry
1739 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1740 ; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
1743 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1744 %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
1745 ret <4 x i32> %vqrdmulh2.i
; Plain fmul of <2 x float> %a by a splat of element 1 of the 64-bit vector %v.
; The generic and Exynos-M3 expectations keep the by-element fmul (v1.s[1]);
; the Exynos-M1 expectation is a dup of that lane followed by a vector fmul.
1748 define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
1749 ; GENERIC-LABEL: test_vmul_lane_f32:
1750 ; GENERIC: // %bb.0: // %entry
1751 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
1752 ; GENERIC-NEXT: fmul v0.2s, v0.2s, v1.s[1]
1755 ; EXYNOSM1-LABEL: test_vmul_lane_f32:
1756 ; EXYNOSM1: // %bb.0: // %entry
1757 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
1758 ; EXYNOSM1-NEXT: dup v1.2s, v1.s[1]
1759 ; EXYNOSM1-NEXT: fmul v0.2s, v0.2s, v1.2s
1760 ; EXYNOSM1-NEXT: ret
1762 ; EXYNOSM3-LABEL: test_vmul_lane_f32:
1763 ; EXYNOSM3: // %bb.0: // %entry
1764 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
1765 ; EXYNOSM3-NEXT: fmul v0.2s, v0.2s, v1.s[1]
1766 ; EXYNOSM3-NEXT: ret
1768 %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
1769 %mul = fmul <2 x float> %shuffle, %a
1770 ret <2 x float> %mul
; Scalar form: %a is bitcast through <8 x i8> to a double, multiplied by
; element 0 of the <1 x double> %v, and the product is inserted into a
; <1 x double>. Expected output for every RUN line: scalar "fmul d0, d0, d1".
1773 define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
1774 ; CHECK-LABEL: test_vmul_lane_f64:
1775 ; CHECK: // %bb.0: // %entry
1776 ; CHECK-NEXT: fmul d0, d0, d1
1779 %0 = bitcast <1 x double> %a to <8 x i8>
1780 %1 = bitcast <8 x i8> %0 to double
1781 %extract = extractelement <1 x double> %v, i32 0
1782 %2 = fmul double %1, %extract
1783 %3 = insertelement <1 x double> undef, double %2, i32 0
; Plain fmul of <4 x float> %a by element 1 of the 64-bit vector %v splatted to
; four lanes. The generic and Exynos-M3 expectations keep the by-element fmul
; (v1.s[1]); the Exynos-M1 expectation is dup followed by a vector fmul.
1787 define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
1788 ; GENERIC-LABEL: test_vmulq_lane_f32:
1789 ; GENERIC: // %bb.0: // %entry
1790 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
1791 ; GENERIC-NEXT: fmul v0.4s, v0.4s, v1.s[1]
1794 ; EXYNOSM1-LABEL: test_vmulq_lane_f32:
1795 ; EXYNOSM1: // %bb.0: // %entry
1796 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
1797 ; EXYNOSM1-NEXT: dup v1.4s, v1.s[1]
1798 ; EXYNOSM1-NEXT: fmul v0.4s, v0.4s, v1.4s
1799 ; EXYNOSM1-NEXT: ret
1801 ; EXYNOSM3-LABEL: test_vmulq_lane_f32:
1802 ; EXYNOSM3: // %bb.0: // %entry
1803 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
1804 ; EXYNOSM3-NEXT: fmul v0.4s, v0.4s, v1.s[1]
1805 ; EXYNOSM3-NEXT: ret
1807 %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1808 %mul = fmul <4 x float> %shuffle, %a
1809 ret <4 x float> %mul
; Plain fmul of <2 x double> %a by the single element of <1 x double> %v
; splatted to two lanes. The generic and Exynos-M3 expectations use the
; by-element fmul (v1.d[0]); the Exynos-M1 expectation is dup + vector fmul.
1812 define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
1813 ; GENERIC-LABEL: test_vmulq_lane_f64:
1814 ; GENERIC: // %bb.0: // %entry
1815 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
1816 ; GENERIC-NEXT: fmul v0.2d, v0.2d, v1.d[0]
1819 ; EXYNOSM1-LABEL: test_vmulq_lane_f64:
1820 ; EXYNOSM1: // %bb.0: // %entry
1821 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
1822 ; EXYNOSM1-NEXT: dup v1.2d, v1.d[0]
1823 ; EXYNOSM1-NEXT: fmul v0.2d, v0.2d, v1.2d
1824 ; EXYNOSM1-NEXT: ret
1826 ; EXYNOSM3-LABEL: test_vmulq_lane_f64:
1827 ; EXYNOSM3: // %bb.0: // %entry
1828 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
1829 ; EXYNOSM3-NEXT: fmul v0.2d, v0.2d, v1.d[0]
1830 ; EXYNOSM3-NEXT: ret
1832 %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
1833 %mul = fmul <2 x double> %shuffle, %a
1834 ret <2 x double> %mul
; Plain fmul of <2 x float> %a by a two-lane splat of element 3 of the 128-bit
; vector %v. The generic and Exynos-M3 expectations use the by-element fmul
; (v1.s[3]); the Exynos-M1 expectation is dup + vector fmul.
1837 define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
1838 ; GENERIC-LABEL: test_vmul_laneq_f32:
1839 ; GENERIC: // %bb.0: // %entry
1840 ; GENERIC-NEXT: fmul v0.2s, v0.2s, v1.s[3]
1843 ; EXYNOSM1-LABEL: test_vmul_laneq_f32:
1844 ; EXYNOSM1: // %bb.0: // %entry
1845 ; EXYNOSM1-NEXT: dup v1.2s, v1.s[3]
1846 ; EXYNOSM1-NEXT: fmul v0.2s, v0.2s, v1.2s
1847 ; EXYNOSM1-NEXT: ret
1849 ; EXYNOSM3-LABEL: test_vmul_laneq_f32:
1850 ; EXYNOSM3: // %bb.0: // %entry
1851 ; EXYNOSM3-NEXT: fmul v0.2s, v0.2s, v1.s[3]
1852 ; EXYNOSM3-NEXT: ret
1854 %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
1855 %mul = fmul <2 x float> %shuffle, %a
1856 ret <2 x float> %mul
; Scalar form: %a is bitcast through <8 x i8> to a double, multiplied by
; element 1 of the <2 x double> %v, and the product is inserted into a
; <1 x double>. Expected output for every RUN line: "fmul d0, d0, v1.d[1]".
1859 define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
1860 ; CHECK-LABEL: test_vmul_laneq_f64:
1861 ; CHECK: // %bb.0: // %entry
1862 ; CHECK-NEXT: fmul d0, d0, v1.d[1]
1865 %0 = bitcast <1 x double> %a to <8 x i8>
1866 %1 = bitcast <8 x i8> %0 to double
1867 %extract = extractelement <2 x double> %v, i32 1
1868 %2 = fmul double %1, %extract
1869 %3 = insertelement <1 x double> undef, double %2, i32 0
; Plain fmul of <4 x float> %a by a splat of element 3 of the 128-bit vector
; %v. The generic and Exynos-M3 expectations use the by-element fmul (v1.s[3]);
; the Exynos-M1 expectation is dup + vector fmul.
1873 define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
1874 ; GENERIC-LABEL: test_vmulq_laneq_f32:
1875 ; GENERIC: // %bb.0: // %entry
1876 ; GENERIC-NEXT: fmul v0.4s, v0.4s, v1.s[3]
1879 ; EXYNOSM1-LABEL: test_vmulq_laneq_f32:
1880 ; EXYNOSM1: // %bb.0: // %entry
1881 ; EXYNOSM1-NEXT: dup v1.4s, v1.s[3]
1882 ; EXYNOSM1-NEXT: fmul v0.4s, v0.4s, v1.4s
1883 ; EXYNOSM1-NEXT: ret
1885 ; EXYNOSM3-LABEL: test_vmulq_laneq_f32:
1886 ; EXYNOSM3: // %bb.0: // %entry
1887 ; EXYNOSM3-NEXT: fmul v0.4s, v0.4s, v1.s[3]
1888 ; EXYNOSM3-NEXT: ret
1890 %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1891 %mul = fmul <4 x float> %shuffle, %a
1892 ret <4 x float> %mul
; Plain fmul of <2 x double> %a by a splat of element 1 of the 128-bit vector
; %v. The generic and Exynos-M3 expectations use the by-element fmul (v1.d[1]);
; the Exynos-M1 expectation is dup + vector fmul.
1895 define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
1896 ; GENERIC-LABEL: test_vmulq_laneq_f64:
1897 ; GENERIC: // %bb.0: // %entry
1898 ; GENERIC-NEXT: fmul v0.2d, v0.2d, v1.d[1]
1901 ; EXYNOSM1-LABEL: test_vmulq_laneq_f64:
1902 ; EXYNOSM1: // %bb.0: // %entry
1903 ; EXYNOSM1-NEXT: dup v1.2d, v1.d[1]
1904 ; EXYNOSM1-NEXT: fmul v0.2d, v0.2d, v1.2d
1905 ; EXYNOSM1-NEXT: ret
1907 ; EXYNOSM3-LABEL: test_vmulq_laneq_f64:
1908 ; EXYNOSM3: // %bb.0: // %entry
1909 ; EXYNOSM3-NEXT: fmul v0.2d, v0.2d, v1.d[1]
1910 ; EXYNOSM3-NEXT: ret
1912 %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
1913 %mul = fmul <2 x double> %shuffle, %a
1914 ret <2 x double> %mul
; fmulx intrinsic on <2 x float> %a and a splat of element 1 of the 64-bit
; vector %v. The generic and Exynos-M3 expectations use the by-element fmulx
; (v1.s[1]); the Exynos-M1 expectation is dup + vector fmulx.
1917 define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
1918 ; GENERIC-LABEL: test_vmulx_lane_f32:
1919 ; GENERIC: // %bb.0: // %entry
1920 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
1921 ; GENERIC-NEXT: fmulx v0.2s, v0.2s, v1.s[1]
1924 ; EXYNOSM1-LABEL: test_vmulx_lane_f32:
1925 ; EXYNOSM1: // %bb.0: // %entry
1926 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
1927 ; EXYNOSM1-NEXT: dup v1.2s, v1.s[1]
1928 ; EXYNOSM1-NEXT: fmulx v0.2s, v0.2s, v1.2s
1929 ; EXYNOSM1-NEXT: ret
1931 ; EXYNOSM3-LABEL: test_vmulx_lane_f32:
1932 ; EXYNOSM3: // %bb.0: // %entry
1933 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
1934 ; EXYNOSM3-NEXT: fmulx v0.2s, v0.2s, v1.s[1]
1935 ; EXYNOSM3-NEXT: ret
1937 %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
1938 %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
1939 ret <2 x float> %vmulx2.i
; fmulx intrinsic on <4 x float> %a and element 1 of the 64-bit vector %v
; splatted to four lanes. The generic and Exynos-M3 expectations use the
; by-element fmulx (v1.s[1]); the Exynos-M1 expectation is dup + vector fmulx.
1942 define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
1943 ; GENERIC-LABEL: test_vmulxq_lane_f32:
1944 ; GENERIC: // %bb.0: // %entry
1945 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
1946 ; GENERIC-NEXT: fmulx v0.4s, v0.4s, v1.s[1]
1949 ; EXYNOSM1-LABEL: test_vmulxq_lane_f32:
1950 ; EXYNOSM1: // %bb.0: // %entry
1951 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
1952 ; EXYNOSM1-NEXT: dup v1.4s, v1.s[1]
1953 ; EXYNOSM1-NEXT: fmulx v0.4s, v0.4s, v1.4s
1954 ; EXYNOSM1-NEXT: ret
1956 ; EXYNOSM3-LABEL: test_vmulxq_lane_f32:
1957 ; EXYNOSM3: // %bb.0: // %entry
1958 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
1959 ; EXYNOSM3-NEXT: fmulx v0.4s, v0.4s, v1.s[1]
1960 ; EXYNOSM3-NEXT: ret
1962 %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1963 %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
1964 ret <4 x float> %vmulx2.i
; fmulx intrinsic on <2 x double> %a and the single element of <1 x double> %v
; splatted to two lanes. The generic and Exynos-M3 expectations use the
; by-element fmulx (v1.d[0]); the Exynos-M1 expectation is dup + vector fmulx.
1967 define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
1968 ; GENERIC-LABEL: test_vmulxq_lane_f64:
1969 ; GENERIC: // %bb.0: // %entry
1970 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
1971 ; GENERIC-NEXT: fmulx v0.2d, v0.2d, v1.d[0]
1974 ; EXYNOSM1-LABEL: test_vmulxq_lane_f64:
1975 ; EXYNOSM1: // %bb.0: // %entry
1976 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
1977 ; EXYNOSM1-NEXT: dup v1.2d, v1.d[0]
1978 ; EXYNOSM1-NEXT: fmulx v0.2d, v0.2d, v1.2d
1979 ; EXYNOSM1-NEXT: ret
1981 ; EXYNOSM3-LABEL: test_vmulxq_lane_f64:
1982 ; EXYNOSM3: // %bb.0: // %entry
1983 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
1984 ; EXYNOSM3-NEXT: fmulx v0.2d, v0.2d, v1.d[0]
1985 ; EXYNOSM3-NEXT: ret
1987 %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
1988 %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
1989 ret <2 x double> %vmulx2.i
; fmulx intrinsic on <2 x float> %a and a two-lane splat of element 3 of the
; 128-bit vector %v. The generic and Exynos-M3 expectations use the by-element
; fmulx (v1.s[3]); the Exynos-M1 expectation is dup + vector fmulx.
1992 define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
1993 ; GENERIC-LABEL: test_vmulx_laneq_f32:
1994 ; GENERIC: // %bb.0: // %entry
1995 ; GENERIC-NEXT: fmulx v0.2s, v0.2s, v1.s[3]
1998 ; EXYNOSM1-LABEL: test_vmulx_laneq_f32:
1999 ; EXYNOSM1: // %bb.0: // %entry
2000 ; EXYNOSM1-NEXT: dup v1.2s, v1.s[3]
2001 ; EXYNOSM1-NEXT: fmulx v0.2s, v0.2s, v1.2s
2002 ; EXYNOSM1-NEXT: ret
2004 ; EXYNOSM3-LABEL: test_vmulx_laneq_f32:
2005 ; EXYNOSM3: // %bb.0: // %entry
2006 ; EXYNOSM3-NEXT: fmulx v0.2s, v0.2s, v1.s[3]
2007 ; EXYNOSM3-NEXT: ret
2009 %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
2010 %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
2011 ret <2 x float> %vmulx2.i
; fmulx intrinsic on <4 x float> %a and a splat of element 3 of the 128-bit
; vector %v. The generic and Exynos-M3 expectations use the by-element fmulx
; (v1.s[3]); the Exynos-M1 expectation is dup + vector fmulx.
2014 define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
2015 ; GENERIC-LABEL: test_vmulxq_laneq_f32:
2016 ; GENERIC: // %bb.0: // %entry
2017 ; GENERIC-NEXT: fmulx v0.4s, v0.4s, v1.s[3]
2020 ; EXYNOSM1-LABEL: test_vmulxq_laneq_f32:
2021 ; EXYNOSM1: // %bb.0: // %entry
2022 ; EXYNOSM1-NEXT: dup v1.4s, v1.s[3]
2023 ; EXYNOSM1-NEXT: fmulx v0.4s, v0.4s, v1.4s
2024 ; EXYNOSM1-NEXT: ret
2026 ; EXYNOSM3-LABEL: test_vmulxq_laneq_f32:
2027 ; EXYNOSM3: // %bb.0: // %entry
2028 ; EXYNOSM3-NEXT: fmulx v0.4s, v0.4s, v1.s[3]
2029 ; EXYNOSM3-NEXT: ret
2031 %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
2032 %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
2033 ret <4 x float> %vmulx2.i
; fmulx intrinsic on <2 x double> %a and a splat of element 1 of the 128-bit
; vector %v. The generic and Exynos-M3 expectations use the by-element fmulx
; (v1.d[1]); the Exynos-M1 expectation is dup + vector fmulx.
2036 define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
2037 ; GENERIC-LABEL: test_vmulxq_laneq_f64:
2038 ; GENERIC: // %bb.0: // %entry
2039 ; GENERIC-NEXT: fmulx v0.2d, v0.2d, v1.d[1]
2042 ; EXYNOSM1-LABEL: test_vmulxq_laneq_f64:
2043 ; EXYNOSM1: // %bb.0: // %entry
2044 ; EXYNOSM1-NEXT: dup v1.2d, v1.d[1]
2045 ; EXYNOSM1-NEXT: fmulx v0.2d, v0.2d, v1.2d
2046 ; EXYNOSM1-NEXT: ret
2048 ; EXYNOSM3-LABEL: test_vmulxq_laneq_f64:
2049 ; EXYNOSM3: // %bb.0: // %entry
2050 ; EXYNOSM3-NEXT: fmulx v0.2d, v0.2d, v1.d[1]
2051 ; EXYNOSM3-NEXT: ret
2053 %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
2054 %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
2055 ret <2 x double> %vmulx2.i
; Lane-0 multiply-accumulate: a splat of element 0 of the 64-bit vector %v is
; multiplied by %b and added to %a (<4 x i16>). Expected output: by-element mla
; reading v2.h[0].
2058 define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
2059 ; CHECK-LABEL: test_vmla_lane_s16_0:
2060 ; CHECK: // %bb.0: // %entry
2061 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2062 ; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[0]
2065 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2066 %mul = mul <4 x i16> %shuffle, %b
2067 %add = add <4 x i16> %mul, %a
; Lane-0 multiply-accumulate: element 0 of the 64-bit vector %v is splatted to
; eight lanes, multiplied by %b and added to %a (<8 x i16>). Expected output:
; by-element mla reading v2.h[0].
2071 define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
2072 ; CHECK-LABEL: test_vmlaq_lane_s16_0:
2073 ; CHECK: // %bb.0: // %entry
2074 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2075 ; CHECK-NEXT: mla v0.8h, v1.8h, v2.h[0]
2078 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
2079 %mul = mul <8 x i16> %shuffle, %b
2080 %add = add <8 x i16> %mul, %a
; Lane-0 multiply-accumulate: a splat of element 0 of the 64-bit vector %v is
; multiplied by %b and added to %a (<2 x i32>). Expected output: by-element mla
; reading v2.s[0].
2084 define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
2085 ; CHECK-LABEL: test_vmla_lane_s32_0:
2086 ; CHECK: // %bb.0: // %entry
2087 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2088 ; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[0]
2091 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2092 %mul = mul <2 x i32> %shuffle, %b
2093 %add = add <2 x i32> %mul, %a
; Lane-0 multiply-accumulate: element 0 of the 64-bit vector %v is splatted to
; four lanes, multiplied by %b and added to %a (<4 x i32>). Expected output:
; by-element mla reading v2.s[0].
2097 define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
2098 ; CHECK-LABEL: test_vmlaq_lane_s32_0:
2099 ; CHECK: // %bb.0: // %entry
2100 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2101 ; CHECK-NEXT: mla v0.4s, v1.4s, v2.s[0]
2104 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
2105 %mul = mul <4 x i32> %shuffle, %b
2106 %add = add <4 x i32> %mul, %a
; Lane-0 multiply-accumulate with a 128-bit lane source: a four-lane splat of
; element 0 of <8 x i16> %v is multiplied by %b and added to %a. Expected
; output: by-element mla reading v2.h[0].
2110 define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
2111 ; CHECK-LABEL: test_vmla_laneq_s16_0:
2112 ; CHECK: // %bb.0: // %entry
2113 ; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[0]
2116 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2117 %mul = mul <4 x i16> %shuffle, %b
2118 %add = add <4 x i16> %mul, %a
; Lane-0 multiply-accumulate with a 128-bit lane source: a splat of element 0
; of <8 x i16> %v is multiplied by %b and added to %a. Expected output:
; by-element mla reading v2.h[0].
2122 define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
2123 ; CHECK-LABEL: test_vmlaq_laneq_s16_0:
2124 ; CHECK: // %bb.0: // %entry
2125 ; CHECK-NEXT: mla v0.8h, v1.8h, v2.h[0]
2128 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
2129 %mul = mul <8 x i16> %shuffle, %b
2130 %add = add <8 x i16> %mul, %a
; Lane-0 multiply-accumulate with a 128-bit lane source: a two-lane splat of
; element 0 of <4 x i32> %v is multiplied by %b and added to %a. Expected
; output: by-element mla reading v2.s[0].
2134 define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
2135 ; CHECK-LABEL: test_vmla_laneq_s32_0:
2136 ; CHECK: // %bb.0: // %entry
2137 ; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[0]
2140 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2141 %mul = mul <2 x i32> %shuffle, %b
2142 %add = add <2 x i32> %mul, %a
; Lane-0 multiply-accumulate with a 128-bit lane source: a splat of element 0
; of <4 x i32> %v is multiplied by %b and added to %a. Expected output:
; by-element mla reading v2.s[0].
2146 define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
2147 ; CHECK-LABEL: test_vmlaq_laneq_s32_0:
2148 ; CHECK: // %bb.0: // %entry
2149 ; CHECK-NEXT: mla v0.4s, v1.4s, v2.s[0]
2152 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
2153 %mul = mul <4 x i32> %shuffle, %b
2154 %add = add <4 x i32> %mul, %a
; Lane-0 multiply-subtract: a splat of element 0 of the 64-bit vector %v is
; multiplied by %b and the product is subtracted from %a (<4 x i16>). Expected
; output: by-element mls reading v2.h[0].
2158 define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
2159 ; CHECK-LABEL: test_vmls_lane_s16_0:
2160 ; CHECK: // %bb.0: // %entry
2161 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2162 ; CHECK-NEXT: mls v0.4h, v1.4h, v2.h[0]
2165 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2166 %mul = mul <4 x i16> %shuffle, %b
2167 %sub = sub <4 x i16> %a, %mul
; Lane-0 multiply-subtract: element 0 of the 64-bit vector %v is splatted to
; eight lanes, multiplied by %b, and the product is subtracted from %a.
; Expected output: by-element mls reading v2.h[0].
2171 define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
2172 ; CHECK-LABEL: test_vmlsq_lane_s16_0:
2173 ; CHECK: // %bb.0: // %entry
2174 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2175 ; CHECK-NEXT: mls v0.8h, v1.8h, v2.h[0]
2178 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
2179 %mul = mul <8 x i16> %shuffle, %b
2180 %sub = sub <8 x i16> %a, %mul
; Lane-0 multiply-subtract: a splat of element 0 of the 64-bit vector %v is
; multiplied by %b and the product is subtracted from %a (<2 x i32>). Expected
; output: by-element mls reading v2.s[0].
2184 define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
2185 ; CHECK-LABEL: test_vmls_lane_s32_0:
2186 ; CHECK: // %bb.0: // %entry
2187 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2188 ; CHECK-NEXT: mls v0.2s, v1.2s, v2.s[0]
2191 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2192 %mul = mul <2 x i32> %shuffle, %b
2193 %sub = sub <2 x i32> %a, %mul
; Lane-0 multiply-subtract: element 0 of the 64-bit vector %v is splatted to
; four lanes, multiplied by %b, and the product is subtracted from %a.
; Expected output: by-element mls reading v2.s[0].
2197 define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
2198 ; CHECK-LABEL: test_vmlsq_lane_s32_0:
2199 ; CHECK: // %bb.0: // %entry
2200 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2201 ; CHECK-NEXT: mls v0.4s, v1.4s, v2.s[0]
2204 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
2205 %mul = mul <4 x i32> %shuffle, %b
2206 %sub = sub <4 x i32> %a, %mul
; Lane-0 multiply-subtract with a 128-bit lane source: a four-lane splat of
; element 0 of <8 x i16> %v is multiplied by %b and subtracted from %a.
; Expected output: by-element mls reading v2.h[0].
2210 define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
2211 ; CHECK-LABEL: test_vmls_laneq_s16_0:
2212 ; CHECK: // %bb.0: // %entry
2213 ; CHECK-NEXT: mls v0.4h, v1.4h, v2.h[0]
2216 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2217 %mul = mul <4 x i16> %shuffle, %b
2218 %sub = sub <4 x i16> %a, %mul
; Lane-0 multiply-subtract with a 128-bit lane source: a splat of element 0 of
; <8 x i16> %v is multiplied by %b and subtracted from %a. Expected output:
; by-element mls reading v2.h[0].
2222 define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
2223 ; CHECK-LABEL: test_vmlsq_laneq_s16_0:
2224 ; CHECK: // %bb.0: // %entry
2225 ; CHECK-NEXT: mls v0.8h, v1.8h, v2.h[0]
2228 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
2229 %mul = mul <8 x i16> %shuffle, %b
2230 %sub = sub <8 x i16> %a, %mul
; Lane-0 multiply-subtract with a 128-bit lane source: a two-lane splat of
; element 0 of <4 x i32> %v is multiplied by %b and subtracted from %a.
; Expected output: by-element mls reading v2.s[0].
2234 define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
2235 ; CHECK-LABEL: test_vmls_laneq_s32_0:
2236 ; CHECK: // %bb.0: // %entry
2237 ; CHECK-NEXT: mls v0.2s, v1.2s, v2.s[0]
2240 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2241 %mul = mul <2 x i32> %shuffle, %b
2242 %sub = sub <2 x i32> %a, %mul
; Lane-0 multiply-subtract with a 128-bit lane source: a splat of element 0 of
; <4 x i32> %v is multiplied by %b and subtracted from %a. Expected output:
; by-element mls reading v2.s[0].
2246 define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
2247 ; CHECK-LABEL: test_vmlsq_laneq_s32_0:
2248 ; CHECK: // %bb.0: // %entry
2249 ; CHECK-NEXT: mls v0.4s, v1.4s, v2.s[0]
2252 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
2253 %mul = mul <4 x i32> %shuffle, %b
2254 %sub = sub <4 x i32> %a, %mul
; Lane-0 multiply: <4 x i16> %a times a splat of element 0 of the 64-bit vector
; %v. Expected output: by-element mul reading v1.h[0].
2258 define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
2259 ; CHECK-LABEL: test_vmul_lane_s16_0:
2260 ; CHECK: // %bb.0: // %entry
2261 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2262 ; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[0]
2265 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2266 %mul = mul <4 x i16> %shuffle, %a
; Lane-0 multiply: <8 x i16> %a times element 0 of the 64-bit vector %v
; splatted to eight lanes. Expected output: by-element mul reading v1.h[0].
2270 define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
2271 ; CHECK-LABEL: test_vmulq_lane_s16_0:
2272 ; CHECK: // %bb.0: // %entry
2273 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2274 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[0]
2277 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
2278 %mul = mul <8 x i16> %shuffle, %a
; Lane-0 multiply: <2 x i32> %a times a splat of element 0 of the 64-bit vector
; %v. Expected output: by-element mul reading v1.s[0].
2282 define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
2283 ; CHECK-LABEL: test_vmul_lane_s32_0:
2284 ; CHECK: // %bb.0: // %entry
2285 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2286 ; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[0]
2289 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2290 %mul = mul <2 x i32> %shuffle, %a
; Lane-0 multiply: <4 x i32> %a times element 0 of the 64-bit vector %v
; splatted to four lanes. Expected output: by-element mul reading v1.s[0].
2294 define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
2295 ; CHECK-LABEL: test_vmulq_lane_s32_0:
2296 ; CHECK: // %bb.0: // %entry
2297 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2298 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[0]
2301 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
2302 %mul = mul <4 x i32> %shuffle, %a
; Lane-0 multiply, _u16 variant: <4 x i16> %a times a splat of element 0 of the
; 64-bit vector %v, using the same integer mul instruction as the signed test.
; Expected output: by-element mul reading v1.h[0].
2306 define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
2307 ; CHECK-LABEL: test_vmul_lane_u16_0:
2308 ; CHECK: // %bb.0: // %entry
2309 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2310 ; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[0]
2313 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2314 %mul = mul <4 x i16> %shuffle, %a
; Lane-0 multiply, _u16 variant: <8 x i16> %a times element 0 of the 64-bit
; vector %v splatted to eight lanes. Expected output: by-element mul reading
; v1.h[0].
2318 define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
2319 ; CHECK-LABEL: test_vmulq_lane_u16_0:
2320 ; CHECK: // %bb.0: // %entry
2321 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2322 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[0]
2325 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
2326 %mul = mul <8 x i16> %shuffle, %a
; Lane-0 multiply, _u32 variant: <2 x i32> %a times a splat of element 0 of the
; 64-bit vector %v. Expected output: by-element mul reading v1.s[0].
2330 define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
2331 ; CHECK-LABEL: test_vmul_lane_u32_0:
2332 ; CHECK: // %bb.0: // %entry
2333 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2334 ; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[0]
2337 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2338 %mul = mul <2 x i32> %shuffle, %a
; Lane-0 multiply, _u32 variant: <4 x i32> %a times element 0 of the 64-bit
; vector %v splatted to four lanes. Expected output: by-element mul reading
; v1.s[0].
2342 define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
2343 ; CHECK-LABEL: test_vmulq_lane_u32_0:
2344 ; CHECK: // %bb.0: // %entry
2345 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
2346 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[0]
2349 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
2350 %mul = mul <4 x i32> %shuffle, %a
; Lane-0 multiply with a 128-bit lane source: <4 x i16> %a times a four-lane
; splat of element 0 of <8 x i16> %v. Expected output: by-element mul reading
; v1.h[0].
2354 define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
2355 ; CHECK-LABEL: test_vmul_laneq_s16_0:
2356 ; CHECK: // %bb.0: // %entry
2357 ; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[0]
2360 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2361 %mul = mul <4 x i16> %shuffle, %a
; Lane-0 multiply with a 128-bit lane source: <8 x i16> %a times a splat of
; element 0 of <8 x i16> %v. Expected output: by-element mul reading v1.h[0].
2365 define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
2366 ; CHECK-LABEL: test_vmulq_laneq_s16_0:
2367 ; CHECK: // %bb.0: // %entry
2368 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[0]
2371 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
2372 %mul = mul <8 x i16> %shuffle, %a
; Lane-0 multiply with a 128-bit lane source: <2 x i32> %a times a two-lane
; splat of element 0 of <4 x i32> %v. Expected output: by-element mul reading
; v1.s[0].
2376 define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
2377 ; CHECK-LABEL: test_vmul_laneq_s32_0:
2378 ; CHECK: // %bb.0: // %entry
2379 ; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[0]
2382 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2383 %mul = mul <2 x i32> %shuffle, %a
; Lane-0 multiply with a 128-bit lane source: <4 x i32> %a times a splat of
; element 0 of <4 x i32> %v. Expected output: by-element mul reading v1.s[0].
2387 define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
2388 ; CHECK-LABEL: test_vmulq_laneq_s32_0:
2389 ; CHECK: // %bb.0: // %entry
2390 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[0]
2393 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
2394 %mul = mul <4 x i32> %shuffle, %a
; Lane-0 multiply, _u16 variant with a 128-bit lane source: <4 x i16> %a times
; a four-lane splat of element 0 of <8 x i16> %v. Expected output: by-element
; mul reading v1.h[0].
2398 define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
2399 ; CHECK-LABEL: test_vmul_laneq_u16_0:
2400 ; CHECK: // %bb.0: // %entry
2401 ; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[0]
2404 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2405 %mul = mul <4 x i16> %shuffle, %a
; Lane-0 multiply, _u16 variant with a 128-bit lane source: <8 x i16> %a times
; a splat of element 0 of <8 x i16> %v. Expected output: by-element mul reading
; v1.h[0].
2409 define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
2410 ; CHECK-LABEL: test_vmulq_laneq_u16_0:
2411 ; CHECK: // %bb.0: // %entry
2412 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.h[0]
2415 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
2416 %mul = mul <8 x i16> %shuffle, %a
; Lane-0 multiply, _u32 variant with a 128-bit lane source: <2 x i32> %a times
; a two-lane splat of element 0 of <4 x i32> %v. Expected output: by-element
; mul reading v1.s[0].
2420 define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
2421 ; CHECK-LABEL: test_vmul_laneq_u32_0:
2422 ; CHECK: // %bb.0: // %entry
2423 ; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[0]
2426 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2427 %mul = mul <2 x i32> %shuffle, %a
; Lane-0 multiply, _u32 variant with a 128-bit lane source: <4 x i32> %a times
; a splat of element 0 of <4 x i32> %v. Expected output: by-element mul reading
; v1.s[0].
2431 define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
2432 ; CHECK-LABEL: test_vmulq_laneq_u32_0:
2433 ; CHECK: // %bb.0: // %entry
2434 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.s[0]
2437 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
2438 %mul = mul <4 x i32> %shuffle, %a
; Lane-0 fused multiply-add: llvm.fma of a splat of element 0 of the 64-bit
; vector %v, %b, and accumulator %a. The generic and Exynos-M3 expectations use
; the by-element fmla (v2.s[0]); the Exynos-M1 expectation is dup + vector fmla.
2442 define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
2443 ; GENERIC-LABEL: test_vfma_lane_f32_0:
2444 ; GENERIC: // %bb.0: // %entry
2445 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
2446 ; GENERIC-NEXT: fmla v0.2s, v1.2s, v2.s[0]
2449 ; EXYNOSM1-LABEL: test_vfma_lane_f32_0:
2450 ; EXYNOSM1: // %bb.0: // %entry
2451 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
2452 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[0]
2453 ; EXYNOSM1-NEXT: fmla v0.2s, v1.2s, v2.2s
2454 ; EXYNOSM1-NEXT: ret
2456 ; EXYNOSM3-LABEL: test_vfma_lane_f32_0:
2457 ; EXYNOSM3: // %bb.0: // %entry
2458 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
2459 ; EXYNOSM3-NEXT: fmla v0.2s, v1.2s, v2.s[0]
2460 ; EXYNOSM3-NEXT: ret
2462 %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
2463 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
; Lane-0 fused multiply-add: llvm.fma of element 0 of the 64-bit vector %v
; splatted to four lanes, %b, and accumulator %a. The generic and Exynos-M3
; expectations use the by-element fmla (v2.s[0]); Exynos-M1 expects dup + fmla.
2467 define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
2468 ; GENERIC-LABEL: test_vfmaq_lane_f32_0:
2469 ; GENERIC: // %bb.0: // %entry
2470 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
2471 ; GENERIC-NEXT: fmla v0.4s, v1.4s, v2.s[0]
2474 ; EXYNOSM1-LABEL: test_vfmaq_lane_f32_0:
2475 ; EXYNOSM1: // %bb.0: // %entry
2476 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
2477 ; EXYNOSM1-NEXT: dup v2.4s, v2.s[0]
2478 ; EXYNOSM1-NEXT: fmla v0.4s, v1.4s, v2.4s
2479 ; EXYNOSM1-NEXT: ret
2481 ; EXYNOSM3-LABEL: test_vfmaq_lane_f32_0:
2482 ; EXYNOSM3: // %bb.0: // %entry
2483 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
2484 ; EXYNOSM3-NEXT: fmla v0.4s, v1.4s, v2.s[0]
2485 ; EXYNOSM3-NEXT: ret
2487 %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
2488 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
; Lane-0 fused multiply-add with a 128-bit lane source: llvm.fma of a two-lane
; splat of element 0 of <4 x float> %v, %b, and accumulator %a. The generic and
; Exynos-M3 expectations use by-element fmla (v2.s[0]); Exynos-M1 expects dup + fmla.
2492 define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
2493 ; GENERIC-LABEL: test_vfma_laneq_f32_0:
2494 ; GENERIC: // %bb.0: // %entry
2495 ; GENERIC-NEXT: fmla v0.2s, v1.2s, v2.s[0]
2498 ; EXYNOSM1-LABEL: test_vfma_laneq_f32_0:
2499 ; EXYNOSM1: // %bb.0: // %entry
2500 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[0]
2501 ; EXYNOSM1-NEXT: fmla v0.2s, v1.2s, v2.2s
2502 ; EXYNOSM1-NEXT: ret
2504 ; EXYNOSM3-LABEL: test_vfma_laneq_f32_0:
2505 ; EXYNOSM3: // %bb.0: // %entry
2506 ; EXYNOSM3-NEXT: fmla v0.2s, v1.2s, v2.s[0]
2507 ; EXYNOSM3-NEXT: ret
2509 %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
2510 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
; Lane-0 fused multiply-add with a 128-bit lane source: llvm.fma of a splat of
; element 0 of <4 x float> %v, %b, and accumulator %a. The generic and Exynos-M3
; expectations use by-element fmla (v2.s[0]); Exynos-M1 expects dup + fmla.
2514 define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
2515 ; GENERIC-LABEL: test_vfmaq_laneq_f32_0:
2516 ; GENERIC: // %bb.0: // %entry
2517 ; GENERIC-NEXT: fmla v0.4s, v1.4s, v2.s[0]
2520 ; EXYNOSM1-LABEL: test_vfmaq_laneq_f32_0:
2521 ; EXYNOSM1: // %bb.0: // %entry
2522 ; EXYNOSM1-NEXT: dup v2.4s, v2.s[0]
2523 ; EXYNOSM1-NEXT: fmla v0.4s, v1.4s, v2.4s
2524 ; EXYNOSM1-NEXT: ret
2526 ; EXYNOSM3-LABEL: test_vfmaq_laneq_f32_0:
2527 ; EXYNOSM3: // %bb.0: // %entry
2528 ; EXYNOSM3-NEXT: fmla v0.4s, v1.4s, v2.s[0]
2529 ; EXYNOSM3-NEXT: ret
2531 %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
2532 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
; Lane-0 fused multiply-subtract: %v is negated (fsub from -0.0), element 0 of
; the negated vector is splatted, and the splat is fed to llvm.fma with %b and
; accumulator %a. The generic and Exynos-M3 expectations use by-element fmls
; (v2.s[0]); the Exynos-M1 expectation is dup + vector fmls.
2536 define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
2537 ; GENERIC-LABEL: test_vfms_lane_f32_0:
2538 ; GENERIC: // %bb.0: // %entry
2539 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
2540 ; GENERIC-NEXT: fmls v0.2s, v1.2s, v2.s[0]
2543 ; EXYNOSM1-LABEL: test_vfms_lane_f32_0:
2544 ; EXYNOSM1: // %bb.0: // %entry
2545 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
2546 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[0]
2547 ; EXYNOSM1-NEXT: fmls v0.2s, v1.2s, v2.2s
2548 ; EXYNOSM1-NEXT: ret
2550 ; EXYNOSM3-LABEL: test_vfms_lane_f32_0:
2551 ; EXYNOSM3: // %bb.0: // %entry
2552 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
2553 ; EXYNOSM3-NEXT: fmls v0.2s, v1.2s, v2.s[0]
2554 ; EXYNOSM3-NEXT: ret
2556 %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
2557 %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
2558 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
; Lane-0 fused multiply-subtract: the 64-bit vector %v is negated (fsub from
; -0.0), element 0 of the result is splatted to four lanes and fed to llvm.fma
; with %b and accumulator %a. The generic and Exynos-M3 expectations use
; by-element fmls (v2.s[0]); the Exynos-M1 expectation is dup + vector fmls.
2562 define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
2563 ; GENERIC-LABEL: test_vfmsq_lane_f32_0:
2564 ; GENERIC: // %bb.0: // %entry
2565 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
2566 ; GENERIC-NEXT: fmls v0.4s, v1.4s, v2.s[0]
2569 ; EXYNOSM1-LABEL: test_vfmsq_lane_f32_0:
2570 ; EXYNOSM1: // %bb.0: // %entry
2571 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
2572 ; EXYNOSM1-NEXT: dup v2.4s, v2.s[0]
2573 ; EXYNOSM1-NEXT: fmls v0.4s, v1.4s, v2.4s
2574 ; EXYNOSM1-NEXT: ret
2576 ; EXYNOSM3-LABEL: test_vfmsq_lane_f32_0:
2577 ; EXYNOSM3: // %bb.0: // %entry
2578 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
2579 ; EXYNOSM3-NEXT: fmls v0.4s, v1.4s, v2.s[0]
2580 ; EXYNOSM3-NEXT: ret
2582 %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
2583 %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
2584 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
; Lane-0 fused multiply-subtract with a 128-bit lane source: <4 x float> %v is
; negated (fsub from -0.0), element 0 of the result is splatted to two lanes
; and fed to llvm.fma with %b and accumulator %a. The generic and Exynos-M3
; expectations use by-element fmls (v2.s[0]); Exynos-M1 expects dup + fmls.
2588 define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
2589 ; GENERIC-LABEL: test_vfms_laneq_f32_0:
2590 ; GENERIC: // %bb.0: // %entry
2591 ; GENERIC-NEXT: fmls v0.2s, v1.2s, v2.s[0]
2594 ; EXYNOSM1-LABEL: test_vfms_laneq_f32_0:
2595 ; EXYNOSM1: // %bb.0: // %entry
2596 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[0]
2597 ; EXYNOSM1-NEXT: fmls v0.2s, v1.2s, v2.2s
2598 ; EXYNOSM1-NEXT: ret
2600 ; EXYNOSM3-LABEL: test_vfms_laneq_f32_0:
2601 ; EXYNOSM3: // %bb.0: // %entry
2602 ; EXYNOSM3-NEXT: fmls v0.2s, v1.2s, v2.s[0]
2603 ; EXYNOSM3-NEXT: ret
2605 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
2606 %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
2607 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
; Lane-0 fused multiply-subtract with a 128-bit lane source: <4 x float> %v is
; negated (fsub from -0.0), element 0 of the result is splatted and fed to
; llvm.fma with %b and accumulator %a. The generic and Exynos-M3 expectations
; use by-element fmls (v2.s[0]); the Exynos-M1 expectation is dup + vector fmls.
2611 define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
2612 ; GENERIC-LABEL: test_vfmsq_laneq_f32_0:
2613 ; GENERIC: // %bb.0: // %entry
2614 ; GENERIC-NEXT: fmls v0.4s, v1.4s, v2.s[0]
2617 ; EXYNOSM1-LABEL: test_vfmsq_laneq_f32_0:
2618 ; EXYNOSM1: // %bb.0: // %entry
2619 ; EXYNOSM1-NEXT: dup v2.4s, v2.s[0]
2620 ; EXYNOSM1-NEXT: fmls v0.4s, v1.4s, v2.4s
2621 ; EXYNOSM1-NEXT: ret
2623 ; EXYNOSM3-LABEL: test_vfmsq_laneq_f32_0:
2624 ; EXYNOSM3: // %bb.0: // %entry
2625 ; EXYNOSM3-NEXT: fmls v0.4s, v1.4s, v2.s[0]
2626 ; EXYNOSM3-NEXT: ret
2628 %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
2629 %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
2630 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
; Fused multiply-add by lane 0, all operands <2 x double>: lane 0 of %v is
; splatted (no negation here) and the splat feeds llvm.fma with %b and %a.
; The generic and Exynos-M3 runs expect a by-element fmla; the Exynos-M1 run
; expects a dup followed by a plain vector fmla.
2634 define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
2635 ; GENERIC-LABEL: test_vfmaq_laneq_f64_0:
2636 ; GENERIC: // %bb.0: // %entry
2637 ; GENERIC-NEXT: fmla v0.2d, v1.2d, v2.d[0]
2640 ; EXYNOSM1-LABEL: test_vfmaq_laneq_f64_0:
2641 ; EXYNOSM1: // %bb.0: // %entry
2642 ; EXYNOSM1-NEXT: dup v2.2d, v2.d[0]
2643 ; EXYNOSM1-NEXT: fmla v0.2d, v1.2d, v2.2d
2644 ; EXYNOSM1-NEXT: ret
2646 ; EXYNOSM3-LABEL: test_vfmaq_laneq_f64_0:
2647 ; EXYNOSM3: // %bb.0: // %entry
2648 ; EXYNOSM3-NEXT: fmla v0.2d, v1.2d, v2.d[0]
2649 ; EXYNOSM3-NEXT: ret
2651 %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
2652 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
; Fused multiply-subtract by lane 0, all operands <2 x double>: %v is negated
; (fsub from -0.0), lane 0 is splatted, and the splat feeds llvm.fma with %b
; and %a. The generic and Exynos-M3 runs expect a by-element fmls; the
; Exynos-M1 run expects a dup followed by a plain vector fmls.
2656 define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
2657 ; GENERIC-LABEL: test_vfmsq_laneq_f64_0:
2658 ; GENERIC: // %bb.0: // %entry
2659 ; GENERIC-NEXT: fmls v0.2d, v1.2d, v2.d[0]
2662 ; EXYNOSM1-LABEL: test_vfmsq_laneq_f64_0:
2663 ; EXYNOSM1: // %bb.0: // %entry
2664 ; EXYNOSM1-NEXT: dup v2.2d, v2.d[0]
2665 ; EXYNOSM1-NEXT: fmls v0.2d, v1.2d, v2.2d
2666 ; EXYNOSM1-NEXT: ret
2668 ; EXYNOSM3-LABEL: test_vfmsq_laneq_f64_0:
2669 ; EXYNOSM3: // %bb.0: // %entry
2670 ; EXYNOSM3-NEXT: fmls v0.2d, v1.2d, v2.d[0]
2671 ; EXYNOSM3-NEXT: ret
2673 %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
2674 %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
2675 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
; Widening multiply-accumulate by lane 0: lane 0 of the <4 x i16> %v is
; splatted, multiplied with %b through the smull intrinsic, and the
; <4 x i32> product is added to %a. All run lines expect a single by-element
; smlal.
2679 define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2680 ; CHECK-LABEL: test_vmlal_lane_s16_0:
2681 ; CHECK: // %bb.0: // %entry
2682 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2683 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[0]
2686 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2687 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2688 %add = add <4 x i32> %vmull2.i, %a
; Widening multiply-accumulate by lane 0: lane 0 of the <2 x i32> %v is
; splatted, multiplied with %b through the smull intrinsic, and the
; <2 x i64> product is added to %a. All run lines expect a single by-element
; smlal.
2692 define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2693 ; CHECK-LABEL: test_vmlal_lane_s32_0:
2694 ; CHECK: // %bb.0: // %entry
2695 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2696 ; CHECK-NEXT: smlal v0.2d, v1.2s, v2.s[0]
2699 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2700 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2701 %add = add <2 x i64> %vmull2.i, %a
; Widening multiply-accumulate by lane 0 of a 128-bit vector: lane 0 of the
; <8 x i16> %v is splatted to <4 x i16>, multiplied with %b through the smull
; intrinsic, and the product is added to %a. All run lines expect a single
; by-element smlal.
2705 define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2706 ; CHECK-LABEL: test_vmlal_laneq_s16_0:
2707 ; CHECK: // %bb.0: // %entry
2708 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[0]
2711 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2712 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2713 %add = add <4 x i32> %vmull2.i, %a
; Widening multiply-accumulate by lane 0 of a 128-bit vector: lane 0 of the
; <4 x i32> %v is splatted to <2 x i32>, multiplied with %b through the smull
; intrinsic, and the product is added to %a. All run lines expect a single
; by-element smlal.
2717 define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2718 ; CHECK-LABEL: test_vmlal_laneq_s32_0:
2719 ; CHECK: // %bb.0: // %entry
2720 ; CHECK-NEXT: smlal v0.2d, v1.2s, v2.s[0]
2723 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2724 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2725 %add = add <2 x i64> %vmull2.i, %a
; High-half widening multiply-accumulate by lane 0: %shuffle.i takes elements
; 4-7 of %b, lane 0 of the <4 x i16> %v is splatted, the two are multiplied
; through the smull intrinsic, and the product is added to %a. All run lines
; expect a single by-element smlal2.
2729 define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2730 ; CHECK-LABEL: test_vmlal_high_lane_s16_0:
2731 ; CHECK: // %bb.0: // %entry
2732 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2733 ; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.h[0]
2736 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2737 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2738 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2739 %add = add <4 x i32> %vmull2.i, %a
; High-half widening multiply-accumulate by lane 0: %shuffle.i takes elements
; 2-3 of %b, lane 0 of the <2 x i32> %v is splatted, the two are multiplied
; through the smull intrinsic, and the product is added to %a. All run lines
; expect a single by-element smlal2.
2743 define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2744 ; CHECK-LABEL: test_vmlal_high_lane_s32_0:
2745 ; CHECK: // %bb.0: // %entry
2746 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2747 ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.s[0]
2750 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2751 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2752 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2753 %add = add <2 x i64> %vmull2.i, %a
; High-half widening multiply-accumulate by lane 0 of a 128-bit vector:
; %shuffle.i takes elements 4-7 of %b, lane 0 of the <8 x i16> %v is splatted
; to <4 x i16>, the two are multiplied through the smull intrinsic, and the
; product is added to %a. All run lines expect a single by-element smlal2.
2757 define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2758 ; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
2759 ; CHECK: // %bb.0: // %entry
2760 ; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.h[0]
2763 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2764 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2765 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2766 %add = add <4 x i32> %vmull2.i, %a
; High-half widening multiply-accumulate by lane 0 of a 128-bit vector:
; %shuffle.i takes elements 2-3 of %b, lane 0 of the <4 x i32> %v is splatted
; to <2 x i32>, the two are multiplied through the smull intrinsic, and the
; product is added to %a. All run lines expect a single by-element smlal2.
2770 define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2771 ; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
2772 ; CHECK: // %bb.0: // %entry
2773 ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.s[0]
2776 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2777 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2778 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2779 %add = add <2 x i64> %vmull2.i, %a
; Widening multiply-subtract by lane 0: lane 0 of the <4 x i16> %v is
; splatted, multiplied with %b through the smull intrinsic, and the
; <4 x i32> product is subtracted from %a. All run lines expect a single
; by-element smlsl.
2783 define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2784 ; CHECK-LABEL: test_vmlsl_lane_s16_0:
2785 ; CHECK: // %bb.0: // %entry
2786 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2787 ; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.h[0]
2790 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2791 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2792 %sub = sub <4 x i32> %a, %vmull2.i
; Widening multiply-subtract by lane 0: lane 0 of the <2 x i32> %v is
; splatted, multiplied with %b through the smull intrinsic, and the
; <2 x i64> product is subtracted from %a. All run lines expect a single
; by-element smlsl.
2796 define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2797 ; CHECK-LABEL: test_vmlsl_lane_s32_0:
2798 ; CHECK: // %bb.0: // %entry
2799 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2800 ; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.s[0]
2803 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2804 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2805 %sub = sub <2 x i64> %a, %vmull2.i
; Widening multiply-subtract by lane 0 of a 128-bit vector: lane 0 of the
; <8 x i16> %v is splatted to <4 x i16>, multiplied with %b through the smull
; intrinsic, and the product is subtracted from %a. All run lines expect a
; single by-element smlsl.
2809 define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2810 ; CHECK-LABEL: test_vmlsl_laneq_s16_0:
2811 ; CHECK: // %bb.0: // %entry
2812 ; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.h[0]
2815 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2816 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2817 %sub = sub <4 x i32> %a, %vmull2.i
; Widening multiply-subtract by lane 0 of a 128-bit vector: lane 0 of the
; <4 x i32> %v is splatted to <2 x i32>, multiplied with %b through the smull
; intrinsic, and the product is subtracted from %a. All run lines expect a
; single by-element smlsl.
2821 define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2822 ; CHECK-LABEL: test_vmlsl_laneq_s32_0:
2823 ; CHECK: // %bb.0: // %entry
2824 ; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.s[0]
2827 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2828 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2829 %sub = sub <2 x i64> %a, %vmull2.i
; High-half widening multiply-subtract by lane 0: %shuffle.i takes elements
; 4-7 of %b, lane 0 of the <4 x i16> %v is splatted, the two are multiplied
; through the smull intrinsic, and the product is subtracted from %a. All run
; lines expect a single by-element smlsl2.
2833 define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2834 ; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
2835 ; CHECK: // %bb.0: // %entry
2836 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2837 ; CHECK-NEXT: smlsl2 v0.4s, v1.8h, v2.h[0]
2840 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2841 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2842 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2843 %sub = sub <4 x i32> %a, %vmull2.i
; High-half widening multiply-subtract by lane 0: %shuffle.i takes elements
; 2-3 of %b, lane 0 of the <2 x i32> %v is splatted, the two are multiplied
; through the smull intrinsic, and the product is subtracted from %a. All run
; lines expect a single by-element smlsl2.
2847 define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2848 ; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
2849 ; CHECK: // %bb.0: // %entry
2850 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2851 ; CHECK-NEXT: smlsl2 v0.2d, v1.4s, v2.s[0]
2854 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2855 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2856 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2857 %sub = sub <2 x i64> %a, %vmull2.i
; High-half widening multiply-subtract by lane 0 of a 128-bit vector:
; %shuffle.i takes elements 4-7 of %b, lane 0 of the <8 x i16> %v is splatted
; to <4 x i16>, the two are multiplied through the smull intrinsic, and the
; product is subtracted from %a. All run lines expect a single by-element
; smlsl2.
2861 define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2862 ; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
2863 ; CHECK: // %bb.0: // %entry
2864 ; CHECK-NEXT: smlsl2 v0.4s, v1.8h, v2.h[0]
2867 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2868 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2869 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2870 %sub = sub <4 x i32> %a, %vmull2.i
; High-half widening multiply-subtract by lane 0 of a 128-bit vector:
; %shuffle.i takes elements 2-3 of %b, lane 0 of the <4 x i32> %v is splatted
; to <2 x i32>, the two are multiplied through the smull intrinsic, and the
; product is subtracted from %a. All run lines expect a single by-element
; smlsl2.
2874 define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2875 ; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
2876 ; CHECK: // %bb.0: // %entry
2877 ; CHECK-NEXT: smlsl2 v0.2d, v1.4s, v2.s[0]
2880 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2881 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2882 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2883 %sub = sub <2 x i64> %a, %vmull2.i
; Widening multiply-accumulate by lane 0 using the umull intrinsic: lane 0 of
; the <4 x i16> %v is splatted, multiplied with %b, and the <4 x i32> product
; is added to %a. All run lines expect a single by-element umlal.
2887 define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2888 ; CHECK-LABEL: test_vmlal_lane_u16_0:
2889 ; CHECK: // %bb.0: // %entry
2890 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2891 ; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[0]
2894 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2895 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2896 %add = add <4 x i32> %vmull2.i, %a
; Widening multiply-accumulate by lane 0 using the umull intrinsic: lane 0 of
; the <2 x i32> %v is splatted, multiplied with %b, and the <2 x i64> product
; is added to %a. All run lines expect a single by-element umlal.
2900 define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2901 ; CHECK-LABEL: test_vmlal_lane_u32_0:
2902 ; CHECK: // %bb.0: // %entry
2903 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2904 ; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[0]
2907 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2908 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2909 %add = add <2 x i64> %vmull2.i, %a
; Widening multiply-accumulate by lane 0 of a 128-bit vector using the umull
; intrinsic: lane 0 of the <8 x i16> %v is splatted to <4 x i16>, multiplied
; with %b, and the product is added to %a. All run lines expect a single
; by-element umlal.
2913 define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2914 ; CHECK-LABEL: test_vmlal_laneq_u16_0:
2915 ; CHECK: // %bb.0: // %entry
2916 ; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[0]
2919 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2920 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2921 %add = add <4 x i32> %vmull2.i, %a
; Widening multiply-accumulate by lane 0 of a 128-bit vector using the umull
; intrinsic: lane 0 of the <4 x i32> %v is splatted to <2 x i32>, multiplied
; with %b, and the product is added to %a. All run lines expect a single
; by-element umlal.
2925 define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2926 ; CHECK-LABEL: test_vmlal_laneq_u32_0:
2927 ; CHECK: // %bb.0: // %entry
2928 ; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[0]
2931 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2932 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2933 %add = add <2 x i64> %vmull2.i, %a
; High-half widening multiply-accumulate by lane 0 using the umull intrinsic:
; %shuffle.i takes elements 4-7 of %b, lane 0 of the <4 x i16> %v is
; splatted, the two are multiplied, and the product is added to %a. All run
; lines expect a single by-element umlal2.
2937 define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2938 ; CHECK-LABEL: test_vmlal_high_lane_u16_0:
2939 ; CHECK: // %bb.0: // %entry
2940 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2941 ; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.h[0]
2944 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2945 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2946 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2947 %add = add <4 x i32> %vmull2.i, %a
; High-half widening multiply-accumulate by lane 0 using the umull intrinsic:
; %shuffle.i takes elements 2-3 of %b, lane 0 of the <2 x i32> %v is
; splatted, the two are multiplied, and the product is added to %a. All run
; lines expect a single by-element umlal2.
2951 define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2952 ; CHECK-LABEL: test_vmlal_high_lane_u32_0:
2953 ; CHECK: // %bb.0: // %entry
2954 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2955 ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.s[0]
2958 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2959 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2960 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2961 %add = add <2 x i64> %vmull2.i, %a
; High-half widening multiply-accumulate by lane 0 of a 128-bit vector using
; the umull intrinsic: %shuffle.i takes elements 4-7 of %b, lane 0 of the
; <8 x i16> %v is splatted to <4 x i16>, the two are multiplied, and the
; product is added to %a. All run lines expect a single by-element umlal2.
2965 define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2966 ; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
2967 ; CHECK: // %bb.0: // %entry
2968 ; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.h[0]
2971 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2972 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2973 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2974 %add = add <4 x i32> %vmull2.i, %a
; High-half widening multiply-accumulate by lane 0 of a 128-bit vector using
; the umull intrinsic: %shuffle.i takes elements 2-3 of %b, lane 0 of the
; <4 x i32> %v is splatted to <2 x i32>, the two are multiplied, and the
; product is added to %a. All run lines expect a single by-element umlal2.
2978 define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2979 ; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
2980 ; CHECK: // %bb.0: // %entry
2981 ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.s[0]
2984 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2985 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2986 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2987 %add = add <2 x i64> %vmull2.i, %a
; Widening multiply-subtract by lane 0 using the umull intrinsic: lane 0 of
; the <4 x i16> %v is splatted, multiplied with %b, and the <4 x i32> product
; is subtracted from %a. All run lines expect a single by-element umlsl.
2991 define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2992 ; CHECK-LABEL: test_vmlsl_lane_u16_0:
2993 ; CHECK: // %bb.0: // %entry
2994 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
2995 ; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.h[0]
2998 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2999 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
3000 %sub = sub <4 x i32> %a, %vmull2.i
; Widening multiply-subtract by lane 0 using the umull intrinsic: lane 0 of
; the <2 x i32> %v is splatted, multiplied with %b, and the <2 x i64> product
; is subtracted from %a. All run lines expect a single by-element umlsl.
3004 define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
3005 ; CHECK-LABEL: test_vmlsl_lane_u32_0:
3006 ; CHECK: // %bb.0: // %entry
3007 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3008 ; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.s[0]
3011 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3012 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
3013 %sub = sub <2 x i64> %a, %vmull2.i
; Widening multiply-subtract by lane 0 of a 128-bit vector using the umull
; intrinsic: lane 0 of the <8 x i16> %v is splatted to <4 x i16>, multiplied
; with %b, and the product is subtracted from %a. All run lines expect a
; single by-element umlsl.
3017 define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
3018 ; CHECK-LABEL: test_vmlsl_laneq_u16_0:
3019 ; CHECK: // %bb.0: // %entry
3020 ; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.h[0]
3023 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3024 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
3025 %sub = sub <4 x i32> %a, %vmull2.i
; Widening multiply-subtract by lane 0 of a 128-bit vector using the umull
; intrinsic: lane 0 of the <4 x i32> %v is splatted to <2 x i32>, multiplied
; with %b, and the product is subtracted from %a. All run lines expect a
; single by-element umlsl.
3029 define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
3030 ; CHECK-LABEL: test_vmlsl_laneq_u32_0:
3031 ; CHECK: // %bb.0: // %entry
3032 ; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.s[0]
3035 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3036 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
3037 %sub = sub <2 x i64> %a, %vmull2.i
; High-half widening multiply-subtract by lane 0 using the umull intrinsic:
; %shuffle.i takes elements 4-7 of %b, lane 0 of the <4 x i16> %v is
; splatted, the two are multiplied, and the product is subtracted from %a.
; All run lines expect a single by-element umlsl2.
3041 define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
3042 ; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
3043 ; CHECK: // %bb.0: // %entry
3044 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3045 ; CHECK-NEXT: umlsl2 v0.4s, v1.8h, v2.h[0]
3048 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3049 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3050 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3051 %sub = sub <4 x i32> %a, %vmull2.i
; High-half widening multiply-subtract by lane 0 using the umull intrinsic:
; %shuffle.i takes elements 2-3 of %b, lane 0 of the <2 x i32> %v is
; splatted, the two are multiplied, and the product is subtracted from %a.
; All run lines expect a single by-element umlsl2.
3055 define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
3056 ; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
3057 ; CHECK: // %bb.0: // %entry
3058 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3059 ; CHECK-NEXT: umlsl2 v0.2d, v1.4s, v2.s[0]
3062 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3063 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3064 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3065 %sub = sub <2 x i64> %a, %vmull2.i
; High-half widening multiply-subtract by lane 0 of a 128-bit vector using
; the umull intrinsic: %shuffle.i takes elements 4-7 of %b, lane 0 of the
; <8 x i16> %v is splatted to <4 x i16>, the two are multiplied, and the
; product is subtracted from %a. All run lines expect a single by-element
; umlsl2.
3069 define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
3070 ; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
3071 ; CHECK: // %bb.0: // %entry
3072 ; CHECK-NEXT: umlsl2 v0.4s, v1.8h, v2.h[0]
3075 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3076 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3077 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3078 %sub = sub <4 x i32> %a, %vmull2.i
; High-half widening multiply-subtract by lane 0 of a 128-bit vector using
; the umull intrinsic: %shuffle.i takes elements 2-3 of %b, lane 0 of the
; <4 x i32> %v is splatted to <2 x i32>, the two are multiplied, and the
; product is subtracted from %a. All run lines expect a single by-element
; umlsl2.
3082 define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
3083 ; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
3084 ; CHECK: // %bb.0: // %entry
3085 ; CHECK-NEXT: umlsl2 v0.2d, v1.4s, v2.s[0]
3088 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3089 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3090 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3091 %sub = sub <2 x i64> %a, %vmull2.i
; Widening multiply by lane 0: lane 0 of the <4 x i16> %v is splatted and
; multiplied with %a through the smull intrinsic; the <4 x i32> product is
; returned. All run lines expect a single by-element smull.
3095 define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
3096 ; CHECK-LABEL: test_vmull_lane_s16_0:
3097 ; CHECK: // %bb.0: // %entry
3098 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3099 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[0]
3102 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3103 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
3104 ret <4 x i32> %vmull2.i
; Widening multiply by lane 0: lane 0 of the <2 x i32> %v is splatted and
; multiplied with %a through the smull intrinsic; the <2 x i64> product is
; returned. All run lines expect a single by-element smull.
3107 define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
3108 ; CHECK-LABEL: test_vmull_lane_s32_0:
3109 ; CHECK: // %bb.0: // %entry
3110 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3111 ; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[0]
3114 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3115 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
3116 ret <2 x i64> %vmull2.i
; Widening multiply by lane 0: lane 0 of the <4 x i16> %v is splatted and
; multiplied with %a through the umull intrinsic; the <4 x i32> product is
; returned. All run lines expect a single by-element umull.
3119 define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
3120 ; CHECK-LABEL: test_vmull_lane_u16_0:
3121 ; CHECK: // %bb.0: // %entry
3122 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3123 ; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[0]
3126 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3127 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
3128 ret <4 x i32> %vmull2.i
; Widening multiply by lane 0: lane 0 of the <2 x i32> %v is splatted and
; multiplied with %a through the umull intrinsic; the <2 x i64> product is
; returned. All run lines expect a single by-element umull.
3131 define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
3132 ; CHECK-LABEL: test_vmull_lane_u32_0:
3133 ; CHECK: // %bb.0: // %entry
3134 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3135 ; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[0]
3138 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3139 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
3140 ret <2 x i64> %vmull2.i
; High-half widening multiply by lane 0: %shuffle.i takes elements 4-7 of %a,
; lane 0 of the <4 x i16> %v is splatted, and the two are multiplied through
; the smull intrinsic; the product is returned. All run lines expect a single
; by-element smull2.
3143 define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
3144 ; CHECK-LABEL: test_vmull_high_lane_s16_0:
3145 ; CHECK: // %bb.0: // %entry
3146 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3147 ; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.h[0]
3150 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3151 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3152 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3153 ret <4 x i32> %vmull2.i
; High-half widening multiply by lane 0: %shuffle.i takes elements 2-3 of %a,
; lane 0 of the <2 x i32> %v is splatted, and the two are multiplied through
; the smull intrinsic; the product is returned. All run lines expect a single
; by-element smull2.
3156 define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
3157 ; CHECK-LABEL: test_vmull_high_lane_s32_0:
3158 ; CHECK: // %bb.0: // %entry
3159 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3160 ; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.s[0]
3163 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3164 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3165 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3166 ret <2 x i64> %vmull2.i
; High-half widening multiply by lane 0: %shuffle.i takes elements 4-7 of %a,
; lane 0 of the <4 x i16> %v is splatted, and the two are multiplied through
; the umull intrinsic; the product is returned. All run lines expect a single
; by-element umull2.
3169 define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
3170 ; CHECK-LABEL: test_vmull_high_lane_u16_0:
3171 ; CHECK: // %bb.0: // %entry
3172 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3173 ; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.h[0]
3176 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3177 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3178 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3179 ret <4 x i32> %vmull2.i
; High-half widening multiply by lane 0: %shuffle.i takes elements 2-3 of %a,
; lane 0 of the <2 x i32> %v is splatted, and the two are multiplied through
; the umull intrinsic; the product is returned. All run lines expect a single
; by-element umull2.
3182 define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
3183 ; CHECK-LABEL: test_vmull_high_lane_u32_0:
3184 ; CHECK: // %bb.0: // %entry
3185 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3186 ; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.s[0]
3189 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3190 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3191 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3192 ret <2 x i64> %vmull2.i
; Widening multiply by lane 0 of a 128-bit vector: lane 0 of the <8 x i16> %v
; is splatted to <4 x i16> and multiplied with %a through the smull
; intrinsic; the product is returned. All run lines expect a single
; by-element smull.
3195 define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
3196 ; CHECK-LABEL: test_vmull_laneq_s16_0:
3197 ; CHECK: // %bb.0: // %entry
3198 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[0]
3201 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3202 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
3203 ret <4 x i32> %vmull2.i
; Widening multiply by lane 0 of a 128-bit vector: lane 0 of the <4 x i32> %v
; is splatted to <2 x i32> and multiplied with %a through the smull
; intrinsic; the product is returned. All run lines expect a single
; by-element smull.
3206 define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
3207 ; CHECK-LABEL: test_vmull_laneq_s32_0:
3208 ; CHECK: // %bb.0: // %entry
3209 ; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[0]
3212 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3213 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
3214 ret <2 x i64> %vmull2.i
; Widening multiply by lane 0 of a 128-bit vector: lane 0 of the <8 x i16> %v
; is splatted to <4 x i16> and multiplied with %a through the umull
; intrinsic; the product is returned. All run lines expect a single
; by-element umull.
3217 define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
3218 ; CHECK-LABEL: test_vmull_laneq_u16_0:
3219 ; CHECK: // %bb.0: // %entry
3220 ; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[0]
3223 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3224 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
3225 ret <4 x i32> %vmull2.i
; Widening multiply by lane 0 of a 128-bit vector: lane 0 of the <4 x i32> %v
; is splatted to <2 x i32> and multiplied with %a through the umull
; intrinsic; the product is returned. All run lines expect a single
; by-element umull.
3228 define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
3229 ; CHECK-LABEL: test_vmull_laneq_u32_0:
3230 ; CHECK: // %bb.0: // %entry
3231 ; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[0]
3234 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3235 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
3236 ret <2 x i64> %vmull2.i
; High-half widening multiply by lane 0 of a 128-bit vector: %shuffle.i takes
; elements 4-7 of %a, lane 0 of the <8 x i16> %v is splatted to <4 x i16>,
; and the two are multiplied through the smull intrinsic; the product is
; returned. All run lines expect a single by-element smull2.
3239 define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
3240 ; CHECK-LABEL: test_vmull_high_laneq_s16_0:
3241 ; CHECK: // %bb.0: // %entry
3242 ; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.h[0]
3245 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3246 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3247 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3248 ret <4 x i32> %vmull2.i
; High-half widening multiply by lane 0 of a 128-bit vector: %shuffle.i takes
; elements 2-3 of %a, lane 0 of the <4 x i32> %v is splatted to <2 x i32>,
; and the two are multiplied through the smull intrinsic; the product is
; returned. All run lines expect a single by-element smull2.
3251 define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
3252 ; CHECK-LABEL: test_vmull_high_laneq_s32_0:
3253 ; CHECK: // %bb.0: // %entry
3254 ; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.s[0]
3257 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3258 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3259 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3260 ret <2 x i64> %vmull2.i
; High-half widening multiply by lane 0 of a 128-bit vector: %shuffle.i takes
; elements 4-7 of %a, lane 0 of the <8 x i16> %v is splatted to <4 x i16>,
; and the two are multiplied through the umull intrinsic; the product is
; returned. All run lines expect a single by-element umull2.
3263 define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
3264 ; CHECK-LABEL: test_vmull_high_laneq_u16_0:
3265 ; CHECK: // %bb.0: // %entry
3266 ; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.h[0]
3269 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3270 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3271 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3272 ret <4 x i32> %vmull2.i
; High-half widening multiply by lane 0 of a 128-bit vector: %shuffle.i takes
; elements 2-3 of %a, lane 0 of the <4 x i32> %v is splatted to <2 x i32>,
; and the two are multiplied through the umull intrinsic; the product is
; returned. All run lines expect a single by-element umull2.
3275 define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
3276 ; CHECK-LABEL: test_vmull_high_laneq_u32_0:
3277 ; CHECK: // %bb.0: // %entry
3278 ; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.s[0]
3281 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3282 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3283 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3284 ret <2 x i64> %vmull2.i
; sqdmull-then-sqadd by lane 0: lane 0 of the <4 x i16> %v is splatted,
; combined with %b through the sqdmull intrinsic, and the result is combined
; with %a through the sqadd intrinsic and returned. All run lines expect the
; pair to fold into a single by-element sqdmlal.
3287 define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
3288 ; CHECK-LABEL: test_vqdmlal_lane_s16_0:
3289 ; CHECK: // %bb.0: // %entry
3290 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3291 ; CHECK-NEXT: sqdmlal v0.4s, v1.4h, v2.h[0]
3294 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3295 %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
3296 %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
3297 ret <4 x i32> %vqdmlal4.i
; sqdmull-then-sqadd by lane 0: lane 0 of the <2 x i32> %v is splatted,
; combined with %b through the sqdmull intrinsic, and the result is combined
; with %a through the sqadd intrinsic and returned. All run lines expect the
; pair to fold into a single by-element sqdmlal.
3300 define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
3301 ; CHECK-LABEL: test_vqdmlal_lane_s32_0:
3302 ; CHECK: // %bb.0: // %entry
3303 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3304 ; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.s[0]
3307 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3308 %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
3309 %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
3310 ret <2 x i64> %vqdmlal4.i
; High-half sqdmull-then-sqadd by lane 0: %shuffle.i takes elements 4-7 of
; %b, lane 0 of the <4 x i16> %v is splatted, the two go through the sqdmull
; intrinsic, and the result is combined with %a through the sqadd intrinsic
; and returned. All run lines expect a single by-element sqdmlal2.
3313 define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
3314 ; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
3315 ; CHECK: // %bb.0: // %entry
3316 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3317 ; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.h[0]
3320 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3321 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3322 %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3323 %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
3324 ret <4 x i32> %vqdmlal4.i
; High-half sqdmull-then-sqadd by lane 0: %shuffle.i takes elements 2-3 of
; %b, lane 0 of the <2 x i32> %v is splatted, the two go through the sqdmull
; intrinsic, and the result is combined with %a through the sqadd intrinsic
; and returned. All run lines expect a single by-element sqdmlal2.
3327 define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
3328 ; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
3329 ; CHECK: // %bb.0: // %entry
3330 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3331 ; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.s[0]
3334 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3335 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3336 %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3337 %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
3338 ret <2 x i64> %vqdmlal4.i
; sqdmull-then-sqsub by lane 0: lane 0 of the <4 x i16> %v is splatted,
; combined with %b through the sqdmull intrinsic, and the result is the
; second operand of an sqsub intrinsic call whose first operand is %a. All
; run lines expect the pair to fold into a single by-element sqdmlsl.
3341 define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
3342 ; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
3343 ; CHECK: // %bb.0: // %entry
3344 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3345 ; CHECK-NEXT: sqdmlsl v0.4s, v1.4h, v2.h[0]
3348 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3349 %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
3350 %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
3351 ret <4 x i32> %vqdmlsl4.i
; sqdmull-then-sqsub by lane 0: lane 0 of the <2 x i32> %v is splatted,
; combined with %b through the sqdmull intrinsic, and the result is the
; second operand of an sqsub intrinsic call whose first operand is %a. All
; run lines expect the pair to fold into a single by-element sqdmlsl.
3354 define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
3355 ; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
3356 ; CHECK: // %bb.0: // %entry
3357 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3358 ; CHECK-NEXT: sqdmlsl v0.2d, v1.2s, v2.s[0]
3361 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3362 %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
3363 %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
3364 ret <2 x i64> %vqdmlsl4.i
; High-half sqdmull-then-sqsub by lane 0: %shuffle.i takes elements 4-7 of
; %b, lane 0 of the <4 x i16> %v is splatted, the two go through the sqdmull
; intrinsic, and the result is the second operand of an sqsub intrinsic call
; whose first operand is %a. All run lines expect a single by-element
; sqdmlsl2.
3367 define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
3368 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
3369 ; CHECK: // %bb.0: // %entry
3370 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3371 ; CHECK-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.h[0]
3374 %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3375 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3376 %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3377 %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
3378 ret <4 x i32> %vqdmlsl4.i
; High-half sqdmull-then-sqsub by lane 0: %shuffle.i takes elements 2-3 of
; %b, lane 0 of the <2 x i32> %v is splatted, the two go through the sqdmull
; intrinsic, and the result is the second operand of an sqsub intrinsic call
; whose first operand is %a. All run lines expect a single by-element
; sqdmlsl2.
3381 define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
3382 ; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
3383 ; CHECK: // %bb.0: // %entry
3384 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
3385 ; CHECK-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.s[0]
3388 %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3389 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3390 %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3391 %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
3392 ret <2 x i64> %vqdmlsl4.i
; sqdmull of %a with a splat of lane 0 of the 4 x i16 vector %v; expected to be
; emitted as a by-element sqdmull that reads v1.h[0].
3395 define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
3396 ; CHECK-LABEL: test_vqdmull_lane_s16_0:
3397 ; CHECK: // %bb.0: // %entry
3398 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3399 ; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[0]
3402 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3403 %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
3404 ret <4 x i32> %vqdmull2.i
; sqdmull of %a with a splat of lane 0 of the 2 x i32 vector %v; expected to be
; emitted as a by-element sqdmull that reads v1.s[0].
3407 define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
3408 ; CHECK-LABEL: test_vqdmull_lane_s32_0:
3409 ; CHECK: // %bb.0: // %entry
3410 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3411 ; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[0]
3414 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3415 %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
3416 ret <2 x i64> %vqdmull2.i
; "laneq" variant: the lane source %v is an 8 x i16 vector, and its lane 0 is
; splatted into a 4-element operand for sqdmull. Expected output is a
; by-element sqdmull that reads v1.h[0].
3419 define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
3420 ; CHECK-LABEL: test_vqdmull_laneq_s16_0:
3421 ; CHECK: // %bb.0: // %entry
3422 ; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[0]
3425 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3426 %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
3427 ret <4 x i32> %vqdmull2.i
; "laneq" variant: the lane source %v is a 4 x i32 vector, and its lane 0 is
; splatted into a 2-element operand for sqdmull. Expected output is a
; by-element sqdmull that reads v1.s[0].
3430 define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
3431 ; CHECK-LABEL: test_vqdmull_laneq_s32_0:
3432 ; CHECK: // %bb.0: // %entry
3433 ; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[0]
3436 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3437 %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
3438 ret <2 x i64> %vqdmull2.i
; "High" variant: elements 4-7 of %a are extracted and multiplied via sqdmull
; by a splat of lane 0 of %v. Expected output is a single sqdmull2 on v0.8h
; with v1.h[0].
3441 define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
3442 ; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
3443 ; CHECK: // %bb.0: // %entry
3444 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3445 ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v1.h[0]
3448 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3449 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3450 %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3451 ret <4 x i32> %vqdmull2.i
; "High" 32-bit variant: elements 2-3 of %a are extracted and multiplied via
; sqdmull by a splat of lane 0 of %v. Expected output is a single sqdmull2 on
; v0.4s with v1.s[0].
3454 define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
3455 ; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
3456 ; CHECK: // %bb.0: // %entry
3457 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3458 ; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.s[0]
3461 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3462 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3463 %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3464 ret <2 x i64> %vqdmull2.i
; "High" + "laneq" variant: elements 4-7 of %a are multiplied via sqdmull by a
; splat of lane 0 taken from the 8 x i16 vector %v. Expected output is a single
; sqdmull2 on v0.8h with v1.h[0].
3467 define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
3468 ; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
3469 ; CHECK: // %bb.0: // %entry
3470 ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v1.h[0]
3473 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3474 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
3475 %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
3476 ret <4 x i32> %vqdmull2.i
; "High" + "laneq" 32-bit variant: elements 2-3 of %a are multiplied via
; sqdmull by a splat of lane 0 taken from the 4 x i32 vector %v. Expected
; output is a single sqdmull2 on v0.4s with v1.s[0].
3479 define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
3480 ; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
3481 ; CHECK: // %bb.0: // %entry
3482 ; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.s[0]
3485 %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3486 %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
3487 %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
3488 ret <2 x i64> %vqdmull2.i
; sqdmulh of %a with a splat of lane 0 of %v (4 x i16); expected to be emitted
; as a by-element sqdmulh on v0.4h that reads v1.h[0].
3491 define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
3492 ; CHECK-LABEL: test_vqdmulh_lane_s16_0:
3493 ; CHECK: // %bb.0: // %entry
3494 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3495 ; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[0]
3498 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3499 %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
3500 ret <4 x i16> %vqdmulh2.i
; "q" variant: lane 0 of the 4 x i16 vector %v is splatted out to 8 elements and
; fed to sqdmulh with the 8 x i16 operand %a. Expected output is a by-element
; sqdmulh on v0.8h that reads v1.h[0].
3503 define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
3504 ; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
3505 ; CHECK: // %bb.0: // %entry
3506 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3507 ; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[0]
3510 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
3511 %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
3512 ret <8 x i16> %vqdmulh2.i
; sqdmulh of %a with a splat of lane 0 of %v (2 x i32); expected to be emitted
; as a by-element sqdmulh on v0.2s that reads v1.s[0].
3515 define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
3516 ; CHECK-LABEL: test_vqdmulh_lane_s32_0:
3517 ; CHECK: // %bb.0: // %entry
3518 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3519 ; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[0]
3522 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3523 %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
3524 ret <2 x i32> %vqdmulh2.i
; "q" variant: lane 0 of the 2 x i32 vector %v is splatted out to 4 elements and
; fed to sqdmulh with the 4 x i32 operand %a. Expected output is a by-element
; sqdmulh on v0.4s that reads v1.s[0].
3527 define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
3528 ; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
3529 ; CHECK: // %bb.0: // %entry
3530 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3531 ; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[0]
3534 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
3535 %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
3536 ret <4 x i32> %vqdmulh2.i
; sqrdmulh of %a with a splat of lane 0 of %v (4 x i16); expected to be emitted
; as a by-element sqrdmulh on v0.4h that reads v1.h[0].
3539 define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
3540 ; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
3541 ; CHECK: // %bb.0: // %entry
3542 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3543 ; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[0]
3546 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
3547 %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
3548 ret <4 x i16> %vqrdmulh2.i
; "q" variant: lane 0 of the 4 x i16 vector %v is splatted out to 8 elements and
; fed to sqrdmulh with the 8 x i16 operand %a. Expected output is a by-element
; sqrdmulh on v0.8h that reads v1.h[0].
3551 define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
3552 ; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
3553 ; CHECK: // %bb.0: // %entry
3554 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3555 ; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[0]
3558 %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
3559 %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
3560 ret <8 x i16> %vqrdmulh2.i
; sqrdmulh of %a with a splat of lane 0 of %v (2 x i32); expected to be emitted
; as a by-element sqrdmulh on v0.2s that reads v1.s[0].
3563 define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
3564 ; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
3565 ; CHECK: // %bb.0: // %entry
3566 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3567 ; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[0]
3570 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
3571 %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
3572 ret <2 x i32> %vqrdmulh2.i
; "q" variant: lane 0 of the 2 x i32 vector %v is splatted out to 4 elements and
; fed to sqrdmulh with the 4 x i32 operand %a. Expected output is a by-element
; sqrdmulh on v0.4s that reads v1.s[0].
3575 define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
3576 ; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
3577 ; CHECK: // %bb.0: // %entry
3578 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
3579 ; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[0]
3582 %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
3583 %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
3584 ret <4 x i32> %vqrdmulh2.i
; Plain IR fmul of %a by a splat of lane 0 of %v (2 x float). Expectations
; differ per run: the generic and exynos-m3 runs expect a by-element fmul on
; v1.s[0], while the exynos-m1 run expects an explicit dup of that lane
; followed by a vector-by-vector fmul.
3587 define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
3588 ; GENERIC-LABEL: test_vmul_lane_f32_0:
3589 ; GENERIC: // %bb.0: // %entry
3590 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
3591 ; GENERIC-NEXT: fmul v0.2s, v0.2s, v1.s[0]
3594 ; EXYNOSM1-LABEL: test_vmul_lane_f32_0:
3595 ; EXYNOSM1: // %bb.0: // %entry
3596 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
3597 ; EXYNOSM1-NEXT: dup v1.2s, v1.s[0]
3598 ; EXYNOSM1-NEXT: fmul v0.2s, v0.2s, v1.2s
3599 ; EXYNOSM1-NEXT: ret
3601 ; EXYNOSM3-LABEL: test_vmul_lane_f32_0:
3602 ; EXYNOSM3: // %bb.0: // %entry
3603 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
3604 ; EXYNOSM3-NEXT: fmul v0.2s, v0.2s, v1.s[0]
3605 ; EXYNOSM3-NEXT: ret
3607 %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
3608 %mul = fmul <2 x float> %shuffle, %a
3609 ret <2 x float> %mul
; "q" variant: lane 0 of the 2 x float vector %v is splatted out to 4 elements
; and multiplied with %a. The generic and exynos-m3 runs expect a by-element
; fmul on v1.s[0]; the exynos-m1 run expects dup v1.4s followed by a
; vector-by-vector fmul.
3612 define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
3613 ; GENERIC-LABEL: test_vmulq_lane_f32_0:
3614 ; GENERIC: // %bb.0: // %entry
3615 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
3616 ; GENERIC-NEXT: fmul v0.4s, v0.4s, v1.s[0]
3619 ; EXYNOSM1-LABEL: test_vmulq_lane_f32_0:
3620 ; EXYNOSM1: // %bb.0: // %entry
3621 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
3622 ; EXYNOSM1-NEXT: dup v1.4s, v1.s[0]
3623 ; EXYNOSM1-NEXT: fmul v0.4s, v0.4s, v1.4s
3624 ; EXYNOSM1-NEXT: ret
3626 ; EXYNOSM3-LABEL: test_vmulq_lane_f32_0:
3627 ; EXYNOSM3: // %bb.0: // %entry
3628 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
3629 ; EXYNOSM3-NEXT: fmul v0.4s, v0.4s, v1.s[0]
3630 ; EXYNOSM3-NEXT: ret
3632 %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
3633 %mul = fmul <4 x float> %shuffle, %a
3634 ret <4 x float> %mul
; "laneq" variant: lane 0 of the 4 x float vector %v is splatted into a
; 2-element operand and multiplied with %a. The generic and exynos-m3 runs
; expect a by-element fmul on v1.s[0]; the exynos-m1 run expects dup v1.2s
; followed by a vector-by-vector fmul.
3637 define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
3638 ; GENERIC-LABEL: test_vmul_laneq_f32_0:
3639 ; GENERIC: // %bb.0: // %entry
3640 ; GENERIC-NEXT: fmul v0.2s, v0.2s, v1.s[0]
3643 ; EXYNOSM1-LABEL: test_vmul_laneq_f32_0:
3644 ; EXYNOSM1: // %bb.0: // %entry
3645 ; EXYNOSM1-NEXT: dup v1.2s, v1.s[0]
3646 ; EXYNOSM1-NEXT: fmul v0.2s, v0.2s, v1.2s
3647 ; EXYNOSM1-NEXT: ret
3649 ; EXYNOSM3-LABEL: test_vmul_laneq_f32_0:
3650 ; EXYNOSM3: // %bb.0: // %entry
3651 ; EXYNOSM3-NEXT: fmul v0.2s, v0.2s, v1.s[0]
3652 ; EXYNOSM3-NEXT: ret
3654 %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
3655 %mul = fmul <2 x float> %shuffle, %a
3656 ret <2 x float> %mul
; Scalar form: %a is bitcast through <8 x i8> to a plain double, multiplied by
; element 0 extracted from %v, and the product is inserted into a <1 x double>.
; The expectation uses the prefix shared by all three runs: a scalar
; by-element fmul d0, d0, v1.d[0].
3659 define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
3660 ; CHECK-LABEL: test_vmul_laneq_f64_0:
3661 ; CHECK: // %bb.0: // %entry
3662 ; CHECK-NEXT: fmul d0, d0, v1.d[0]
3665 %0 = bitcast <1 x double> %a to <8 x i8>
3666 %1 = bitcast <8 x i8> %0 to double
3667 %extract = extractelement <2 x double> %v, i32 0
3668 %2 = fmul double %1, %extract
3669 %3 = insertelement <1 x double> undef, double %2, i32 0
; fmul of %a by a splat of lane 0 of %v, both 4 x float. The generic and
; exynos-m3 runs expect a by-element fmul on v1.s[0]; the exynos-m1 run expects
; dup v1.4s followed by a vector-by-vector fmul.
3673 define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
3674 ; GENERIC-LABEL: test_vmulq_laneq_f32_0:
3675 ; GENERIC: // %bb.0: // %entry
3676 ; GENERIC-NEXT: fmul v0.4s, v0.4s, v1.s[0]
3679 ; EXYNOSM1-LABEL: test_vmulq_laneq_f32_0:
3680 ; EXYNOSM1: // %bb.0: // %entry
3681 ; EXYNOSM1-NEXT: dup v1.4s, v1.s[0]
3682 ; EXYNOSM1-NEXT: fmul v0.4s, v0.4s, v1.4s
3683 ; EXYNOSM1-NEXT: ret
3685 ; EXYNOSM3-LABEL: test_vmulq_laneq_f32_0:
3686 ; EXYNOSM3: // %bb.0: // %entry
3687 ; EXYNOSM3-NEXT: fmul v0.4s, v0.4s, v1.s[0]
3688 ; EXYNOSM3-NEXT: ret
3690 %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
3691 %mul = fmul <4 x float> %shuffle, %a
3692 ret <4 x float> %mul
; fmul of %a by a splat of lane 0 of %v, both 2 x double. The generic and
; exynos-m3 runs expect a by-element fmul on v1.d[0]; the exynos-m1 run expects
; dup v1.2d followed by a vector-by-vector fmul.
3695 define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
3696 ; GENERIC-LABEL: test_vmulq_laneq_f64_0:
3697 ; GENERIC: // %bb.0: // %entry
3698 ; GENERIC-NEXT: fmul v0.2d, v0.2d, v1.d[0]
3701 ; EXYNOSM1-LABEL: test_vmulq_laneq_f64_0:
3702 ; EXYNOSM1: // %bb.0: // %entry
3703 ; EXYNOSM1-NEXT: dup v1.2d, v1.d[0]
3704 ; EXYNOSM1-NEXT: fmul v0.2d, v0.2d, v1.2d
3705 ; EXYNOSM1-NEXT: ret
3707 ; EXYNOSM3-LABEL: test_vmulq_laneq_f64_0:
3708 ; EXYNOSM3: // %bb.0: // %entry
3709 ; EXYNOSM3-NEXT: fmul v0.2d, v0.2d, v1.d[0]
3710 ; EXYNOSM3-NEXT: ret
3712 %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
3713 %mul = fmul <2 x double> %shuffle, %a
3714 ret <2 x double> %mul
; fmulx intrinsic applied to %a and a splat of lane 0 of %v (2 x float). The
; generic and exynos-m3 runs expect a by-element fmulx on v1.s[0]; the
; exynos-m1 run expects dup v1.2s followed by a vector-by-vector fmulx.
3717 define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
3718 ; GENERIC-LABEL: test_vmulx_lane_f32_0:
3719 ; GENERIC: // %bb.0: // %entry
3720 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
3721 ; GENERIC-NEXT: fmulx v0.2s, v0.2s, v1.s[0]
3724 ; EXYNOSM1-LABEL: test_vmulx_lane_f32_0:
3725 ; EXYNOSM1: // %bb.0: // %entry
3726 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
3727 ; EXYNOSM1-NEXT: dup v1.2s, v1.s[0]
3728 ; EXYNOSM1-NEXT: fmulx v0.2s, v0.2s, v1.2s
3729 ; EXYNOSM1-NEXT: ret
3731 ; EXYNOSM3-LABEL: test_vmulx_lane_f32_0:
3732 ; EXYNOSM3: // %bb.0: // %entry
3733 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
3734 ; EXYNOSM3-NEXT: fmulx v0.2s, v0.2s, v1.s[0]
3735 ; EXYNOSM3-NEXT: ret
3737 %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
3738 %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
3739 ret <2 x float> %vmulx2.i
; "q" variant: lane 0 of the 2 x float vector %v is splatted out to 4 elements
; and passed with %a to the fmulx intrinsic. The generic and exynos-m3 runs
; expect a by-element fmulx on v1.s[0]; the exynos-m1 run expects dup v1.4s
; followed by a vector-by-vector fmulx.
3742 define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
3743 ; GENERIC-LABEL: test_vmulxq_lane_f32_0:
3744 ; GENERIC: // %bb.0: // %entry
3745 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
3746 ; GENERIC-NEXT: fmulx v0.4s, v0.4s, v1.s[0]
3749 ; EXYNOSM1-LABEL: test_vmulxq_lane_f32_0:
3750 ; EXYNOSM1: // %bb.0: // %entry
3751 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
3752 ; EXYNOSM1-NEXT: dup v1.4s, v1.s[0]
3753 ; EXYNOSM1-NEXT: fmulx v0.4s, v0.4s, v1.4s
3754 ; EXYNOSM1-NEXT: ret
3756 ; EXYNOSM3-LABEL: test_vmulxq_lane_f32_0:
3757 ; EXYNOSM3: // %bb.0: // %entry
3758 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
3759 ; EXYNOSM3-NEXT: fmulx v0.4s, v0.4s, v1.s[0]
3760 ; EXYNOSM3-NEXT: ret
3762 %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
3763 %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
3764 ret <4 x float> %vmulx2.i
; The single element of the <1 x double> vector %v is splatted to 2 elements and
; passed with %a to the fmulx intrinsic. The generic and exynos-m3 runs expect
; a by-element fmulx on v1.d[0]; the exynos-m1 run expects dup v1.2d followed
; by a vector-by-vector fmulx.
3767 define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
3768 ; GENERIC-LABEL: test_vmulxq_lane_f64_0:
3769 ; GENERIC: // %bb.0: // %entry
3770 ; GENERIC-NEXT: // kill: def $d1 killed $d1 def $q1
3771 ; GENERIC-NEXT: fmulx v0.2d, v0.2d, v1.d[0]
3774 ; EXYNOSM1-LABEL: test_vmulxq_lane_f64_0:
3775 ; EXYNOSM1: // %bb.0: // %entry
3776 ; EXYNOSM1-NEXT: // kill: def $d1 killed $d1 def $q1
3777 ; EXYNOSM1-NEXT: dup v1.2d, v1.d[0]
3778 ; EXYNOSM1-NEXT: fmulx v0.2d, v0.2d, v1.2d
3779 ; EXYNOSM1-NEXT: ret
3781 ; EXYNOSM3-LABEL: test_vmulxq_lane_f64_0:
3782 ; EXYNOSM3: // %bb.0: // %entry
3783 ; EXYNOSM3-NEXT: // kill: def $d1 killed $d1 def $q1
3784 ; EXYNOSM3-NEXT: fmulx v0.2d, v0.2d, v1.d[0]
3785 ; EXYNOSM3-NEXT: ret
3787 %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
3788 %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
3789 ret <2 x double> %vmulx2.i
; "laneq" variant: lane 0 of the 4 x float vector %v is splatted into a
; 2-element operand for the fmulx intrinsic. The generic and exynos-m3 runs
; expect a by-element fmulx on v1.s[0]; the exynos-m1 run expects dup v1.2s
; followed by a vector-by-vector fmulx.
3792 define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
3793 ; GENERIC-LABEL: test_vmulx_laneq_f32_0:
3794 ; GENERIC: // %bb.0: // %entry
3795 ; GENERIC-NEXT: fmulx v0.2s, v0.2s, v1.s[0]
3798 ; EXYNOSM1-LABEL: test_vmulx_laneq_f32_0:
3799 ; EXYNOSM1: // %bb.0: // %entry
3800 ; EXYNOSM1-NEXT: dup v1.2s, v1.s[0]
3801 ; EXYNOSM1-NEXT: fmulx v0.2s, v0.2s, v1.2s
3802 ; EXYNOSM1-NEXT: ret
3804 ; EXYNOSM3-LABEL: test_vmulx_laneq_f32_0:
3805 ; EXYNOSM3: // %bb.0: // %entry
3806 ; EXYNOSM3-NEXT: fmulx v0.2s, v0.2s, v1.s[0]
3807 ; EXYNOSM3-NEXT: ret
3809 %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
3810 %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
3811 ret <2 x float> %vmulx2.i
; fmulx intrinsic applied to %a and a splat of lane 0 of %v, both 4 x float.
; The generic and exynos-m3 runs expect a by-element fmulx on v1.s[0]; the
; exynos-m1 run expects dup v1.4s followed by a vector-by-vector fmulx.
3814 define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
3815 ; GENERIC-LABEL: test_vmulxq_laneq_f32_0:
3816 ; GENERIC: // %bb.0: // %entry
3817 ; GENERIC-NEXT: fmulx v0.4s, v0.4s, v1.s[0]
3820 ; EXYNOSM1-LABEL: test_vmulxq_laneq_f32_0:
3821 ; EXYNOSM1: // %bb.0: // %entry
3822 ; EXYNOSM1-NEXT: dup v1.4s, v1.s[0]
3823 ; EXYNOSM1-NEXT: fmulx v0.4s, v0.4s, v1.4s
3824 ; EXYNOSM1-NEXT: ret
3826 ; EXYNOSM3-LABEL: test_vmulxq_laneq_f32_0:
3827 ; EXYNOSM3: // %bb.0: // %entry
3828 ; EXYNOSM3-NEXT: fmulx v0.4s, v0.4s, v1.s[0]
3829 ; EXYNOSM3-NEXT: ret
3831 %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
3832 %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
3833 ret <4 x float> %vmulx2.i
; fmulx intrinsic applied to %a and a splat of lane 0 of %v, both 2 x double.
; The generic and exynos-m3 runs expect a by-element fmulx on v1.d[0]; the
; exynos-m1 run expects dup v1.2d followed by a vector-by-vector fmulx.
3836 define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
3837 ; GENERIC-LABEL: test_vmulxq_laneq_f64_0:
3838 ; GENERIC: // %bb.0: // %entry
3839 ; GENERIC-NEXT: fmulx v0.2d, v0.2d, v1.d[0]
3842 ; EXYNOSM1-LABEL: test_vmulxq_laneq_f64_0:
3843 ; EXYNOSM1: // %bb.0: // %entry
3844 ; EXYNOSM1-NEXT: dup v1.2d, v1.d[0]
3845 ; EXYNOSM1-NEXT: fmulx v0.2d, v0.2d, v1.2d
3846 ; EXYNOSM1-NEXT: ret
3848 ; EXYNOSM3-LABEL: test_vmulxq_laneq_f64_0:
3849 ; EXYNOSM3: // %bb.0: // %entry
3850 ; EXYNOSM3-NEXT: fmulx v0.2d, v0.2d, v1.d[0]
3851 ; EXYNOSM3-NEXT: ret
3853 %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
3854 %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
3855 ret <2 x double> %vmulx2.i
; Two identical splats of lane 3 of %v: one feeds llvm.fma together with %b and
; %a, the other feeds an fmul with %c whose product is subtracted from the fma
; result. The generic and exynos-m3 runs expect by-element fmla and fmls, both
; on v3.s[3]. The exynos-m1 run expects a single dup of v3.s[3] that is shared
; by the vector fmla and the vector fmls.
3858 define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
3859 ; GENERIC-LABEL: optimize_dup:
3860 ; GENERIC: // %bb.0: // %entry
3861 ; GENERIC-NEXT: fmla v0.4s, v1.4s, v3.s[3]
3862 ; GENERIC-NEXT: fmls v0.4s, v2.4s, v3.s[3]
3865 ; EXYNOSM1-LABEL: optimize_dup:
3866 ; EXYNOSM1: // %bb.0: // %entry
3867 ; EXYNOSM1-NEXT: dup v3.4s, v3.s[3]
3868 ; EXYNOSM1-NEXT: fmla v0.4s, v1.4s, v3.4s
3869 ; EXYNOSM1-NEXT: fmls v0.4s, v2.4s, v3.4s
3870 ; EXYNOSM1-NEXT: ret
3872 ; EXYNOSM3-LABEL: optimize_dup:
3873 ; EXYNOSM3: // %bb.0: // %entry
3874 ; EXYNOSM3-NEXT: fmla v0.4s, v1.4s, v3.s[3]
3875 ; EXYNOSM3-NEXT: fmls v0.4s, v2.4s, v3.s[3]
3876 ; EXYNOSM3-NEXT: ret
3878 %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
3879 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
3880 %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
3881 %1 = fmul <4 x float> %lane2, %c
3882 %s = fsub <4 x float> %0, %1
; Counterpart of optimize_dup in which the two splats use different lanes of
; %v: lane 3 feeds llvm.fma and lane 1 feeds the fmul whose product is
; subtracted. The generic and exynos-m3 runs expect by-element fmla on v3.s[3]
; and fmls on v3.s[1]. The exynos-m1 run expects two separate dup
; instructions, one per lane, each followed by its vector fmla or fmls.
3886 define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
3887 ; GENERIC-LABEL: no_optimize_dup:
3888 ; GENERIC: // %bb.0: // %entry
3889 ; GENERIC-NEXT: fmla v0.4s, v1.4s, v3.s[3]
3890 ; GENERIC-NEXT: fmls v0.4s, v2.4s, v3.s[1]
3893 ; EXYNOSM1-LABEL: no_optimize_dup:
3894 ; EXYNOSM1: // %bb.0: // %entry
3895 ; EXYNOSM1-NEXT: dup v4.4s, v3.s[3]
3896 ; EXYNOSM1-NEXT: fmla v0.4s, v1.4s, v4.4s
3897 ; EXYNOSM1-NEXT: dup v1.4s, v3.s[1]
3898 ; EXYNOSM1-NEXT: fmls v0.4s, v2.4s, v1.4s
3899 ; EXYNOSM1-NEXT: ret
3901 ; EXYNOSM3-LABEL: no_optimize_dup:
3902 ; EXYNOSM3: // %bb.0: // %entry
3903 ; EXYNOSM3-NEXT: fmla v0.4s, v1.4s, v3.s[3]
3904 ; EXYNOSM3-NEXT: fmls v0.4s, v2.4s, v3.s[1]
3905 ; EXYNOSM3-NEXT: ret
3907 %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
3908 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
3909 %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3910 %1 = fmul <4 x float> %lane2, %c
3911 %s = fsub <4 x float> %0, %1
; llvm.fma of a splat of lane 1 of %v with %b and %a, in a function that carries
; the "target-cpu"="cortex-a57" attribute. The generic and exynos-m3 runs
; expect a by-element fmla on v2.s[1]; the exynos-m1 run expects dup v2.2s
; followed by a vector fmla.
; NOTE(review): going by the name, this presumably guards against the SIMD
; instruction optimization pass reusing a cached per-CPU decision across
; functions with different target CPUs - confirm against the pass.
3915 define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" {
3916 ; GENERIC-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
3917 ; GENERIC: // %bb.0: // %entry
3918 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
3919 ; GENERIC-NEXT: fmla v0.2s, v1.2s, v2.s[1]
3922 ; EXYNOSM1-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
3923 ; EXYNOSM1: // %bb.0: // %entry
3924 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
3925 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[1]
3926 ; EXYNOSM1-NEXT: fmla v0.2s, v1.2s, v2.2s
3927 ; EXYNOSM1-NEXT: ret
3929 ; EXYNOSM3-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
3930 ; EXYNOSM3: // %bb.0: // %entry
3931 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
3932 ; EXYNOSM3-NEXT: fmla v0.2s, v1.2s, v2.s[1]
3933 ; EXYNOSM3-NEXT: ret
3935 %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
3936 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
; Same IR body as the cortex-a57 sibling, but the function carries the
; "target-cpu"="exynos-m1" attribute. Here the generic run also expects dup
; v2.2s followed by a vector fmla, matching the exynos-m1 run; the exynos-m3
; run still expects a by-element fmla on v2.s[1].
3940 define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m1(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m1" {
3941 ; GENERIC-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
3942 ; GENERIC: // %bb.0: // %entry
3943 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
3944 ; GENERIC-NEXT: dup v2.2s, v2.s[1]
3945 ; GENERIC-NEXT: fmla v0.2s, v1.2s, v2.2s
3948 ; EXYNOSM1-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
3949 ; EXYNOSM1: // %bb.0: // %entry
3950 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
3951 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[1]
3952 ; EXYNOSM1-NEXT: fmla v0.2s, v1.2s, v2.2s
3953 ; EXYNOSM1-NEXT: ret
3955 ; EXYNOSM3-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
3956 ; EXYNOSM3: // %bb.0: // %entry
3957 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
3958 ; EXYNOSM3-NEXT: fmla v0.2s, v1.2s, v2.s[1]
3959 ; EXYNOSM3-NEXT: ret
3961 %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
3962 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
; Same IR body as the cortex-a57 and exynos-m1 siblings, but the function
; carries the "target-cpu"="exynos-m3" attribute. The generic and exynos-m3
; runs expect a by-element fmla on v2.s[1]; the exynos-m1 run expects dup
; v2.2s followed by a vector fmla.
3966 define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m3(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m3" {
3967 ; GENERIC-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3:
3968 ; GENERIC: // %bb.0: // %entry
3969 ; GENERIC-NEXT: // kill: def $d2 killed $d2 def $q2
3970 ; GENERIC-NEXT: fmla v0.2s, v1.2s, v2.s[1]
3973 ; EXYNOSM1-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3:
3974 ; EXYNOSM1: // %bb.0: // %entry
3975 ; EXYNOSM1-NEXT: // kill: def $d2 killed $d2 def $q2
3976 ; EXYNOSM1-NEXT: dup v2.2s, v2.s[1]
3977 ; EXYNOSM1-NEXT: fmla v0.2s, v1.2s, v2.2s
3978 ; EXYNOSM1-NEXT: ret
3980 ; EXYNOSM3-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3:
3981 ; EXYNOSM3: // %bb.0: // %entry
3982 ; EXYNOSM3-NEXT: // kill: def $d2 killed $d2 def $q2
3983 ; EXYNOSM3-NEXT: fmla v0.2s, v1.2s, v2.s[1]
3984 ; EXYNOSM3-NEXT: ret
3986 %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
3987 %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)