1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s
4 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
5 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5
6 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
7 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
8 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
9 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
10 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
11 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
12 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
13 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
14 declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
15 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
16 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
17 declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
18 declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
19 declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
20 declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
21 declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
22 declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
23 declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
24 declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>)
25 declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>)
26 declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>)
27 declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>)
28 declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>)
29 declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>)
; Widening signed add: both <8 x i8> inputs are sign-extended to <8 x i16> and summed; the checks expect a single saddl.
31 define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) {
32 ; CHECK-LABEL: test_vaddl_s8:
33 ; CHECK: // %bb.0: // %entry
34 ; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b
37 %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
38 %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
39 %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
; Widening signed add: both <4 x i16> inputs are sign-extended to <4 x i32> and summed; the checks expect a single saddl.
43 define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) {
44 ; CHECK-LABEL: test_vaddl_s16:
45 ; CHECK: // %bb.0: // %entry
46 ; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
49 %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
50 %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
51 %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
; Widening signed add: both <2 x i32> inputs are sign-extended to <2 x i64> and summed; the checks expect a single saddl.
55 define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) {
56 ; CHECK-LABEL: test_vaddl_s32:
57 ; CHECK: // %bb.0: // %entry
58 ; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
61 %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
62 %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
63 %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
; Widening unsigned add: both <8 x i8> inputs are zero-extended to <8 x i16> and summed; the checks expect a single uaddl.
67 define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) {
68 ; CHECK-LABEL: test_vaddl_u8:
69 ; CHECK: // %bb.0: // %entry
70 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
73 %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
74 %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
75 %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
; Widening unsigned add: both <4 x i16> inputs are zero-extended to <4 x i32> and summed; the checks expect a single uaddl.
79 define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) {
80 ; CHECK-LABEL: test_vaddl_u16:
81 ; CHECK: // %bb.0: // %entry
82 ; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
85 %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
86 %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
87 %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
; Widening unsigned add: both <2 x i32> inputs are zero-extended to <2 x i64> and summed; the checks expect a single uaddl.
91 define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) {
92 ; CHECK-LABEL: test_vaddl_u32:
93 ; CHECK: // %bb.0: // %entry
94 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
97 %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
98 %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
99 %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
; Widening unsigned add of two <8 x i8> inputs, then an AND with 255 in every i16 lane; the checks expect uaddl followed by a bic that clears the upper byte.
103 define <8 x i16> @test_vaddl_a8(<8 x i8> %a, <8 x i8> %b) {
104 ; CHECK-LABEL: test_vaddl_a8:
105 ; CHECK: // %bb.0: // %entry
106 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
107 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
110 %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
111 %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
112 %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
113 %and = and <8 x i16> %add.i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; Widening unsigned add of two <4 x i16> inputs, then an AND with 65535 in every i32 lane; the checks expect a movi-built mask, uaddl, and a vector and.
117 define <4 x i32> @test_vaddl_a16(<4 x i16> %a, <4 x i16> %b) {
118 ; CHECK-LABEL: test_vaddl_a16:
119 ; CHECK: // %bb.0: // %entry
120 ; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
121 ; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
122 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
125 %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
126 %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
127 %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
128 %and = and <4 x i32> %add.i, <i32 65535, i32 65535, i32 65535, i32 65535>
; Widening unsigned add of two <2 x i32> inputs, then an AND with 4294967295 in every i64 lane; the checks expect a movi-built mask, uaddl, and a vector and.
132 define <2 x i64> @test_vaddl_a32(<2 x i32> %a, <2 x i32> %b) {
133 ; CHECK-LABEL: test_vaddl_a32:
134 ; CHECK: // %bb.0: // %entry
135 ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
136 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
137 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
140 %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
141 %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
142 %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
143 %and = and <2 x i64> %add.i, <i64 4294967295, i64 4294967295>
; High-half widening signed add: lanes 8-15 of each <16 x i8> input are extracted, sign-extended to i16 and summed; the checks expect a single saddl2.
147 define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) {
148 ; CHECK-LABEL: test_vaddl_high_s8:
149 ; CHECK: // %bb.0: // %entry
150 ; CHECK-NEXT: saddl2 v0.8h, v0.16b, v1.16b
153 %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
154 %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
155 %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
156 %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
157 %add.i = add <8 x i16> %0, %1
; High-half widening signed add: lanes 4-7 of each <8 x i16> input are extracted, sign-extended to i32 and summed; the checks expect a single saddl2.
161 define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) {
162 ; CHECK-LABEL: test_vaddl_high_s16:
163 ; CHECK: // %bb.0: // %entry
164 ; CHECK-NEXT: saddl2 v0.4s, v0.8h, v1.8h
167 %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
168 %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
169 %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
170 %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
171 %add.i = add <4 x i32> %0, %1
; High-half widening signed add: lanes 2-3 of each <4 x i32> input are extracted, sign-extended to i64 and summed; the checks expect a single saddl2.
175 define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) {
176 ; CHECK-LABEL: test_vaddl_high_s32:
177 ; CHECK: // %bb.0: // %entry
178 ; CHECK-NEXT: saddl2 v0.2d, v0.4s, v1.4s
181 %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
182 %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
183 %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
184 %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
185 %add.i = add <2 x i64> %0, %1
; High-half widening unsigned add: lanes 8-15 of each <16 x i8> input are extracted, zero-extended to i16 and summed; the checks expect a single uaddl2.
189 define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) {
190 ; CHECK-LABEL: test_vaddl_high_u8:
191 ; CHECK: // %bb.0: // %entry
192 ; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v1.16b
195 %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
196 %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
197 %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
198 %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
199 %add.i = add <8 x i16> %0, %1
; High-half widening unsigned add: lanes 4-7 of each <8 x i16> input are extracted, zero-extended to i32 and summed; the checks expect a single uaddl2.
203 define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) {
204 ; CHECK-LABEL: test_vaddl_high_u16:
205 ; CHECK: // %bb.0: // %entry
206 ; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v1.8h
209 %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
210 %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
211 %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
212 %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
213 %add.i = add <4 x i32> %0, %1
; High-half widening unsigned add: lanes 2-3 of each <4 x i32> input are extracted, zero-extended to i64 and summed; the checks expect a single uaddl2.
217 define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) {
218 ; CHECK-LABEL: test_vaddl_high_u32:
219 ; CHECK: // %bb.0: // %entry
220 ; CHECK-NEXT: uaddl2 v0.2d, v0.4s, v1.4s
223 %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
224 %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
225 %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
226 %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
227 %add.i = add <2 x i64> %0, %1
; High-half widening unsigned add (lanes 8-15 of each <16 x i8> input), then an AND with 255 per i16 lane; the checks expect uaddl2 followed by a bic that clears the upper byte.
231 define <8 x i16> @test_vaddl_high_a8(<16 x i8> %a, <16 x i8> %b) {
232 ; CHECK-LABEL: test_vaddl_high_a8:
233 ; CHECK: // %bb.0: // %entry
234 ; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v1.16b
235 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
238 %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
239 %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
240 %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
241 %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
242 %add.i = add <8 x i16> %0, %1
243 %and = and <8 x i16> %add.i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; High-half widening unsigned add (lanes 4-7 of each <8 x i16> input), then an AND with 65535 per i32 lane; the checks expect a movi-built mask, uaddl2, and a vector and.
247 define <4 x i32> @test_vaddl_high_a16(<8 x i16> %a, <8 x i16> %b) {
248 ; CHECK-LABEL: test_vaddl_high_a16:
249 ; CHECK: // %bb.0: // %entry
250 ; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
251 ; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v1.8h
252 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
255 %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
256 %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
257 %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
258 %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
259 %add.i = add <4 x i32> %0, %1
260 %and = and <4 x i32> %add.i, <i32 65535, i32 65535, i32 65535, i32 65535>
; High-half widening unsigned add (lanes 2-3 of each <4 x i32> input), then an AND with 4294967295 per i64 lane; the checks expect a movi-built mask, uaddl2, and a vector and.
264 define <2 x i64> @test_vaddl_high_a32(<4 x i32> %a, <4 x i32> %b) {
265 ; CHECK-LABEL: test_vaddl_high_a32:
266 ; CHECK: // %bb.0: // %entry
267 ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
268 ; CHECK-NEXT: uaddl2 v0.2d, v0.4s, v1.4s
269 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
272 %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
273 %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
274 %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
275 %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
276 %add.i = add <2 x i64> %0, %1
277 %and = and <2 x i64> %add.i, <i64 4294967295, i64 4294967295>
; Wide signed add: the <8 x i8> operand %b is sign-extended to <8 x i16> and added to the already-wide %a; the checks expect a single saddw.
281 define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) {
282 ; CHECK-LABEL: test_vaddw_s8:
283 ; CHECK: // %bb.0: // %entry
284 ; CHECK-NEXT: saddw v0.8h, v0.8h, v1.8b
287 %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
288 %add.i = add <8 x i16> %vmovl.i.i, %a
; Wide signed add: the <4 x i16> operand %b is sign-extended to <4 x i32> and added to the already-wide %a; the checks expect a single saddw.
292 define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) {
293 ; CHECK-LABEL: test_vaddw_s16:
294 ; CHECK: // %bb.0: // %entry
295 ; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h
298 %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
299 %add.i = add <4 x i32> %vmovl.i.i, %a
; Wide signed add: the <2 x i32> operand %b is sign-extended to <2 x i64> and added to the already-wide %a; the checks expect a single saddw.
303 define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) {
304 ; CHECK-LABEL: test_vaddw_s32:
305 ; CHECK: // %bb.0: // %entry
306 ; CHECK-NEXT: saddw v0.2d, v0.2d, v1.2s
309 %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
310 %add.i = add <2 x i64> %vmovl.i.i, %a
; Wide unsigned add: the <8 x i8> operand %b is zero-extended to <8 x i16> and added to the already-wide %a; the checks expect a single uaddw.
314 define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) {
315 ; CHECK-LABEL: test_vaddw_u8:
316 ; CHECK: // %bb.0: // %entry
317 ; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b
320 %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
321 %add.i = add <8 x i16> %vmovl.i.i, %a
; Wide unsigned add: the <4 x i16> operand %b is zero-extended to <4 x i32> and added to the already-wide %a; the checks expect a single uaddw.
325 define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) {
326 ; CHECK-LABEL: test_vaddw_u16:
327 ; CHECK: // %bb.0: // %entry
328 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
331 %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
332 %add.i = add <4 x i32> %vmovl.i.i, %a
; Wide unsigned add: the <2 x i32> operand %b is zero-extended to <2 x i64> and added to the already-wide %a; the checks expect a single uaddw.
336 define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) {
337 ; CHECK-LABEL: test_vaddw_u32:
338 ; CHECK: // %bb.0: // %entry
339 ; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s
342 %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
343 %add.i = add <2 x i64> %vmovl.i.i, %a
; Wide unsigned add of zero-extended <8 x i8> %b to %a, then an AND with 255 per i16 lane; the checks expect uaddw followed by a bic that clears the upper byte.
347 define <8 x i16> @test_vaddw_a8(<8 x i16> %a, <8 x i8> %b) {
348 ; CHECK-LABEL: test_vaddw_a8:
349 ; CHECK: // %bb.0: // %entry
350 ; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b
351 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
354 %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
355 %add.i = add <8 x i16> %vmovl.i.i, %a
356 %and = and <8 x i16> %add.i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; Wide unsigned add of zero-extended <4 x i16> %b to %a, then an AND with 65535 per i32 lane; the checks expect a movi-built mask, uaddw, and a vector and.
360 define <4 x i32> @test_vaddw_a16(<4 x i32> %a, <4 x i16> %b) {
361 ; CHECK-LABEL: test_vaddw_a16:
362 ; CHECK: // %bb.0: // %entry
363 ; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
364 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
365 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
368 %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
369 %add.i = add <4 x i32> %vmovl.i.i, %a
370 %and = and <4 x i32> %add.i, <i32 65535, i32 65535, i32 65535, i32 65535>
; Wide unsigned add of zero-extended <2 x i32> %b to %a, then an AND with 4294967295 per i64 lane; the checks expect a movi-built mask, uaddw, and a vector and.
374 define <2 x i64> @test_vaddw_a32(<2 x i64> %a, <2 x i32> %b) {
375 ; CHECK-LABEL: test_vaddw_a32:
376 ; CHECK: // %bb.0: // %entry
377 ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
378 ; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s
379 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
382 %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
383 %add.i = add <2 x i64> %vmovl.i.i, %a
384 %and = and <2 x i64> %add.i, <i64 4294967295, i64 4294967295>
; High-half wide signed add: lanes 8-15 of <16 x i8> %b are extracted, sign-extended to i16 and added to %a; the checks expect a single saddw2.
388 define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) {
389 ; CHECK-LABEL: test_vaddw_high_s8:
390 ; CHECK: // %bb.0: // %entry
391 ; CHECK-NEXT: saddw2 v0.8h, v0.8h, v1.16b
394 %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
395 %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
396 %add.i = add <8 x i16> %0, %a
; High-half wide signed add: lanes 4-7 of <8 x i16> %b are extracted, sign-extended to i32 and added to %a; the checks expect a single saddw2.
400 define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) {
401 ; CHECK-LABEL: test_vaddw_high_s16:
402 ; CHECK: // %bb.0: // %entry
403 ; CHECK-NEXT: saddw2 v0.4s, v0.4s, v1.8h
406 %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407 %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
408 %add.i = add <4 x i32> %0, %a
; High-half wide signed add: lanes 2-3 of <4 x i32> %b are extracted, sign-extended to i64 and added to %a; the checks expect a single saddw2.
412 define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) {
413 ; CHECK-LABEL: test_vaddw_high_s32:
414 ; CHECK: // %bb.0: // %entry
415 ; CHECK-NEXT: saddw2 v0.2d, v0.2d, v1.4s
418 %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
419 %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
420 %add.i = add <2 x i64> %0, %a
; High-half wide unsigned add: lanes 8-15 of <16 x i8> %b are extracted, zero-extended to i16 and added to %a; the checks expect a single uaddw2.
424 define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) {
425 ; CHECK-LABEL: test_vaddw_high_u8:
426 ; CHECK: // %bb.0: // %entry
427 ; CHECK-NEXT: uaddw2 v0.8h, v0.8h, v1.16b
430 %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
431 %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
432 %add.i = add <8 x i16> %0, %a
; High-half wide unsigned add: lanes 4-7 of <8 x i16> %b are extracted, zero-extended to i32 and added to %a; the checks expect a single uaddw2.
436 define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) {
437 ; CHECK-LABEL: test_vaddw_high_u16:
438 ; CHECK: // %bb.0: // %entry
439 ; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
442 %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
443 %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
444 %add.i = add <4 x i32> %0, %a
; High-half wide unsigned add: lanes 2-3 of <4 x i32> %b are extracted, zero-extended to i64 and added to %a; the checks expect a single uaddw2.
448 define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) {
449 ; CHECK-LABEL: test_vaddw_high_u32:
450 ; CHECK: // %bb.0: // %entry
451 ; CHECK-NEXT: uaddw2 v0.2d, v0.2d, v1.4s
454 %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
455 %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
456 %add.i = add <2 x i64> %0, %a
; High-half wide unsigned add (lanes 8-15 of <16 x i8> %b zero-extended and added to %a), then an AND with 255 per i16 lane; the checks expect uaddw2 followed by a bic that clears the upper byte.
460 define <8 x i16> @test_vaddw_high_a8(<8 x i16> %a, <16 x i8> %b) {
461 ; CHECK-LABEL: test_vaddw_high_a8:
462 ; CHECK: // %bb.0: // %entry
463 ; CHECK-NEXT: uaddw2 v0.8h, v0.8h, v1.16b
464 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
467 %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
468 %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
469 %add.i = add <8 x i16> %0, %a
470 %and = and <8 x i16> %add.i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; High-half wide unsigned add (lanes 4-7 of <8 x i16> %b zero-extended and added to %a), then an AND with 65535 per i32 lane; the checks expect a movi-built mask, uaddw2, and a vector and.
474 define <4 x i32> @test_vaddw_high_a16(<4 x i32> %a, <8 x i16> %b) {
475 ; CHECK-LABEL: test_vaddw_high_a16:
476 ; CHECK: // %bb.0: // %entry
477 ; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
478 ; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
479 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
482 %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
483 %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
484 %add.i = add <4 x i32> %0, %a
485 %and = and <4 x i32> %add.i, <i32 65535, i32 65535, i32 65535, i32 65535>
; High-half wide unsigned add (lanes 2-3 of <4 x i32> %b zero-extended and added to %a), then an AND with 4294967295 per i64 lane; the checks expect a movi-built mask, uaddw2, and a vector and.
489 define <2 x i64> @test_vaddw_high_a32(<2 x i64> %a, <4 x i32> %b) {
490 ; CHECK-LABEL: test_vaddw_high_a32:
491 ; CHECK: // %bb.0: // %entry
492 ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
493 ; CHECK-NEXT: uaddw2 v0.2d, v0.2d, v1.4s
494 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
497 %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
498 %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
499 %add.i = add <2 x i64> %0, %a
500 %and = and <2 x i64> %add.i, <i64 4294967295, i64 4294967295>
; Widening signed subtract: both <8 x i8> inputs are sign-extended to <8 x i16>, then %b is subtracted from %a; the checks expect a single ssubl.
504 define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) {
505 ; CHECK-LABEL: test_vsubl_s8:
506 ; CHECK: // %bb.0: // %entry
507 ; CHECK-NEXT: ssubl v0.8h, v0.8b, v1.8b
510 %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
511 %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
512 %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
; Widening signed subtract: both <4 x i16> inputs are sign-extended to <4 x i32>, then %b is subtracted from %a; the checks expect a single ssubl.
516 define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) {
517 ; CHECK-LABEL: test_vsubl_s16:
518 ; CHECK: // %bb.0: // %entry
519 ; CHECK-NEXT: ssubl v0.4s, v0.4h, v1.4h
522 %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
523 %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
524 %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
; Widening signed subtract: both <2 x i32> inputs are sign-extended to <2 x i64>, then %b is subtracted from %a; the checks expect a single ssubl.
528 define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) {
529 ; CHECK-LABEL: test_vsubl_s32:
530 ; CHECK: // %bb.0: // %entry
531 ; CHECK-NEXT: ssubl v0.2d, v0.2s, v1.2s
534 %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
535 %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
536 %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
; Widening unsigned subtract: both <8 x i8> inputs are zero-extended to <8 x i16>, then %b is subtracted from %a; the checks expect a single usubl.
540 define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) {
541 ; CHECK-LABEL: test_vsubl_u8:
542 ; CHECK: // %bb.0: // %entry
543 ; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
546 %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
547 %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
548 %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
; Widening unsigned subtract: both <4 x i16> inputs are zero-extended to <4 x i32>, then %b is subtracted from %a; the checks expect a single usubl.
552 define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) {
553 ; CHECK-LABEL: test_vsubl_u16:
554 ; CHECK: // %bb.0: // %entry
555 ; CHECK-NEXT: usubl v0.4s, v0.4h, v1.4h
558 %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
559 %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
560 %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
; Widening unsigned subtract: both <2 x i32> inputs are zero-extended to <2 x i64>, then %b is subtracted from %a; the checks expect a single usubl.
564 define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) {
565 ; CHECK-LABEL: test_vsubl_u32:
566 ; CHECK: // %bb.0: // %entry
567 ; CHECK-NEXT: usubl v0.2d, v0.2s, v1.2s
570 %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
571 %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
572 %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
; Widening unsigned subtract of two <8 x i8> inputs, then an AND with 255 per i16 lane; the checks expect usubl followed by a bic that clears the upper byte.
576 define <8 x i16> @test_vsubl_a8(<8 x i8> %a, <8 x i8> %b) {
577 ; CHECK-LABEL: test_vsubl_a8:
578 ; CHECK: // %bb.0: // %entry
579 ; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
580 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
583 %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
584 %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
585 %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
586 %and = and <8 x i16> %sub.i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; Widening unsigned subtract of two <4 x i16> inputs, then an AND with 65535 per i32 lane; the checks expect a movi-built mask, usubl, and a vector and.
590 define <4 x i32> @test_vsubl_a16(<4 x i16> %a, <4 x i16> %b) {
591 ; CHECK-LABEL: test_vsubl_a16:
592 ; CHECK: // %bb.0: // %entry
593 ; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
594 ; CHECK-NEXT: usubl v0.4s, v0.4h, v1.4h
595 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
598 %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
599 %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
600 %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
601 %and = and <4 x i32> %sub.i, <i32 65535, i32 65535, i32 65535, i32 65535>
; Widening unsigned subtract of two <2 x i32> inputs, then an AND with 4294967295 per i64 lane; the checks expect a movi-built mask, usubl, and a vector and.
605 define <2 x i64> @test_vsubl_a32(<2 x i32> %a, <2 x i32> %b) {
606 ; CHECK-LABEL: test_vsubl_a32:
607 ; CHECK: // %bb.0: // %entry
608 ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
609 ; CHECK-NEXT: usubl v0.2d, v0.2s, v1.2s
610 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
613 %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
614 %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
615 %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
616 %and = and <2 x i64> %sub.i, <i64 4294967295, i64 4294967295>
; High-half widening signed subtract: lanes 8-15 of each <16 x i8> input are extracted and sign-extended to i16, then %b's half is subtracted from %a's; the checks expect a single ssubl2.
620 define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) {
621 ; CHECK-LABEL: test_vsubl_high_s8:
622 ; CHECK: // %bb.0: // %entry
623 ; CHECK-NEXT: ssubl2 v0.8h, v0.16b, v1.16b
626 %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
627 %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
628 %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
629 %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
630 %sub.i = sub <8 x i16> %0, %1
; High-half widening signed subtract: lanes 4-7 of each <8 x i16> input are extracted and sign-extended to i32, then %b's half is subtracted from %a's; the checks expect a single ssubl2.
634 define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) {
635 ; CHECK-LABEL: test_vsubl_high_s16:
636 ; CHECK: // %bb.0: // %entry
637 ; CHECK-NEXT: ssubl2 v0.4s, v0.8h, v1.8h
640 %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
641 %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
642 %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
643 %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
644 %sub.i = sub <4 x i32> %0, %1
; High-half widening signed subtract: lanes 2-3 of each <4 x i32> input are extracted and sign-extended to i64, then %b's half is subtracted from %a's; the checks expect a single ssubl2.
648 define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) {
649 ; CHECK-LABEL: test_vsubl_high_s32:
650 ; CHECK: // %bb.0: // %entry
651 ; CHECK-NEXT: ssubl2 v0.2d, v0.4s, v1.4s
654 %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
655 %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
656 %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
657 %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
658 %sub.i = sub <2 x i64> %0, %1
; High-half widening unsigned subtract: lanes 8-15 of each <16 x i8> input are extracted and zero-extended to i16, then %b's half is subtracted from %a's; the checks expect a single usubl2.
662 define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) {
663 ; CHECK-LABEL: test_vsubl_high_u8:
664 ; CHECK: // %bb.0: // %entry
665 ; CHECK-NEXT: usubl2 v0.8h, v0.16b, v1.16b
668 %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
669 %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
670 %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
671 %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
672 %sub.i = sub <8 x i16> %0, %1
; High-half widening unsigned subtract: lanes 4-7 of each <8 x i16> input are extracted and zero-extended to i32, then %b's half is subtracted from %a's; the checks expect a single usubl2.
676 define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) {
677 ; CHECK-LABEL: test_vsubl_high_u16:
678 ; CHECK: // %bb.0: // %entry
679 ; CHECK-NEXT: usubl2 v0.4s, v0.8h, v1.8h
682 %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
683 %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
684 %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
685 %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
686 %sub.i = sub <4 x i32> %0, %1
; High-half widening unsigned subtract: lanes 2-3 of each <4 x i32> input are extracted and zero-extended to i64, then %b's half is subtracted from %a's; the checks expect a single usubl2.
690 define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) {
691 ; CHECK-LABEL: test_vsubl_high_u32:
692 ; CHECK: // %bb.0: // %entry
693 ; CHECK-NEXT: usubl2 v0.2d, v0.4s, v1.4s
696 %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
697 %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
698 %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
699 %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
700 %sub.i = sub <2 x i64> %0, %1
; High-half widening unsigned subtract (lanes 8-15 of each <16 x i8> input), then an AND with 255 per i16 lane; the checks expect usubl2 followed by a bic that clears the upper byte.
704 define <8 x i16> @test_vsubl_high_a8(<16 x i8> %a, <16 x i8> %b) {
705 ; CHECK-LABEL: test_vsubl_high_a8:
706 ; CHECK: // %bb.0: // %entry
707 ; CHECK-NEXT: usubl2 v0.8h, v0.16b, v1.16b
708 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
711 %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
712 %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
713 %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
714 %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
715 %sub.i = sub <8 x i16> %0, %1
716 %and = and <8 x i16> %sub.i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; High-half widening unsigned subtract (lanes 4-7 of each <8 x i16> input), then an AND with 65535 per i32 lane; the checks expect a movi-built mask, usubl2, and a vector and.
720 define <4 x i32> @test_vsubl_high_a16(<8 x i16> %a, <8 x i16> %b) {
721 ; CHECK-LABEL: test_vsubl_high_a16:
722 ; CHECK: // %bb.0: // %entry
723 ; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
724 ; CHECK-NEXT: usubl2 v0.4s, v0.8h, v1.8h
725 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
728 %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
729 %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
730 %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
731 %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
732 %sub.i = sub <4 x i32> %0, %1
733 %and = and <4 x i32> %sub.i, <i32 65535, i32 65535, i32 65535, i32 65535>
; High-half widening unsigned subtract (lanes 2-3 of each <4 x i32> input), then an AND with 4294967295 per i64 lane; the checks expect a movi-built mask, usubl2, and a vector and.
737 define <2 x i64> @test_vsubl_high_a32(<4 x i32> %a, <4 x i32> %b) {
738 ; CHECK-LABEL: test_vsubl_high_a32:
739 ; CHECK: // %bb.0: // %entry
740 ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
741 ; CHECK-NEXT: usubl2 v0.2d, v0.4s, v1.4s
742 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
745 %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
746 %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
747 %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
748 %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
749 %sub.i = sub <2 x i64> %0, %1
750 %and = and <2 x i64> %sub.i, <i64 4294967295, i64 4294967295>
; Wide signed subtract: the <8 x i8> operand %b is sign-extended to <8 x i16> and subtracted from the already-wide %a; the checks expect a single ssubw.
754 define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) {
755 ; CHECK-LABEL: test_vsubw_s8:
756 ; CHECK: // %bb.0: // %entry
757 ; CHECK-NEXT: ssubw v0.8h, v0.8h, v1.8b
760 %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
761 %sub.i = sub <8 x i16> %a, %vmovl.i.i
; Wide signed subtract: the <4 x i16> operand %b is sign-extended to <4 x i32> and subtracted from the already-wide %a; the checks expect a single ssubw.
765 define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) {
766 ; CHECK-LABEL: test_vsubw_s16:
767 ; CHECK: // %bb.0: // %entry
768 ; CHECK-NEXT: ssubw v0.4s, v0.4s, v1.4h
771 %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
772 %sub.i = sub <4 x i32> %a, %vmovl.i.i
; Wide signed subtract: the <2 x i32> operand %b is sign-extended to <2 x i64> and subtracted from the already-wide %a; the checks expect a single ssubw.
776 define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) {
777 ; CHECK-LABEL: test_vsubw_s32:
778 ; CHECK: // %bb.0: // %entry
779 ; CHECK-NEXT: ssubw v0.2d, v0.2d, v1.2s
782 %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
783 %sub.i = sub <2 x i64> %a, %vmovl.i.i
; Wide unsigned subtract: the <8 x i8> operand %b is zero-extended to <8 x i16> and subtracted from the already-wide %a; the checks expect a single usubw.
787 define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) {
788 ; CHECK-LABEL: test_vsubw_u8:
789 ; CHECK: // %bb.0: // %entry
790 ; CHECK-NEXT: usubw v0.8h, v0.8h, v1.8b
793 %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
794 %sub.i = sub <8 x i16> %a, %vmovl.i.i
; Wide unsigned subtract: the <4 x i16> operand %b is zero-extended to <4 x i32> and subtracted from the already-wide %a; the checks expect a single usubw.
798 define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) {
799 ; CHECK-LABEL: test_vsubw_u16:
800 ; CHECK: // %bb.0: // %entry
801 ; CHECK-NEXT: usubw v0.4s, v0.4s, v1.4h
804 %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
805 %sub.i = sub <4 x i32> %a, %vmovl.i.i
; Wide unsigned subtract: the <2 x i32> operand %b is zero-extended to <2 x i64> and subtracted from the already-wide %a; the checks expect a single usubw.
809 define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) {
810 ; CHECK-LABEL: test_vsubw_u32:
811 ; CHECK: // %bb.0: // %entry
812 ; CHECK-NEXT: usubw v0.2d, v0.2d, v1.2s
815 %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
816 %sub.i = sub <2 x i64> %a, %vmovl.i.i
; Wide unsigned subtract of zero-extended <8 x i8> %b from %a, then an AND with 255 per i16 lane; the checks expect usubw followed by a bic that clears the upper byte.
820 define <8 x i16> @test_vsubw_a8(<8 x i16> %a, <8 x i8> %b) {
821 ; CHECK-LABEL: test_vsubw_a8:
822 ; CHECK: // %bb.0: // %entry
823 ; CHECK-NEXT: usubw v0.8h, v0.8h, v1.8b
824 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
827 %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
828 %sub.i = sub <8 x i16> %a, %vmovl.i.i
829 %and = and <8 x i16> %sub.i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; Wide unsigned subtract of zero-extended <4 x i16> %b from %a, then an AND with 65535 per i32 lane; the checks expect a movi-built mask, usubw, and a vector and.
833 define <4 x i32> @test_vsubw_a16(<4 x i32> %a, <4 x i16> %b) {
834 ; CHECK-LABEL: test_vsubw_a16:
835 ; CHECK: // %bb.0: // %entry
836 ; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
837 ; CHECK-NEXT: usubw v0.4s, v0.4s, v1.4h
838 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
841 %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
842 %sub.i = sub <4 x i32> %a, %vmovl.i.i
843 %and = and <4 x i32> %sub.i, <i32 65535, i32 65535, i32 65535, i32 65535>
; Wide unsigned subtract of zero-extended <2 x i32> %b from %a, then an AND with 4294967295 per i64 lane; the checks expect a movi-built mask, usubw, and a vector and.
847 define <2 x i64> @test_vsubw_a32(<2 x i64> %a, <2 x i32> %b) {
848 ; CHECK-LABEL: test_vsubw_a32:
849 ; CHECK: // %bb.0: // %entry
850 ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
851 ; CHECK-NEXT: usubw v0.2d, v0.2d, v1.2s
852 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
855 %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
856 %sub.i = sub <2 x i64> %a, %vmovl.i.i
857 %and = and <2 x i64> %sub.i, <i64 4294967295, i64 4294967295>
; High-half wide signed subtract: lanes 8-15 of <16 x i8> %b are extracted, sign-extended to i16 and subtracted from %a; the checks expect a single ssubw2.
861 define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) {
862 ; CHECK-LABEL: test_vsubw_high_s8:
863 ; CHECK: // %bb.0: // %entry
864 ; CHECK-NEXT: ssubw2 v0.8h, v0.8h, v1.16b
867 %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
868 %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
869 %sub.i = sub <8 x i16> %a, %0
; High-half wide signed subtract: lanes 4-7 of <8 x i16> %b are extracted, sign-extended to i32 and subtracted from %a; the checks expect a single ssubw2.
873 define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) {
874 ; CHECK-LABEL: test_vsubw_high_s16:
875 ; CHECK: // %bb.0: // %entry
876 ; CHECK-NEXT: ssubw2 v0.4s, v0.4s, v1.8h
879 %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
880 %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
881 %sub.i = sub <4 x i32> %a, %0
; High-half wide signed subtract: lanes 2-3 of <4 x i32> %b are extracted, sign-extended to i64 and subtracted from %a; the checks expect a single ssubw2.
885 define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) {
886 ; CHECK-LABEL: test_vsubw_high_s32:
887 ; CHECK: // %bb.0: // %entry
888 ; CHECK-NEXT: ssubw2 v0.2d, v0.2d, v1.4s
891 %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
892 %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
893 %sub.i = sub <2 x i64> %a, %0
; High-half wide unsigned subtract: lanes 8-15 of <16 x i8> %b are extracted, zero-extended to i16 and subtracted from %a; the checks expect a single usubw2.
897 define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) {
898 ; CHECK-LABEL: test_vsubw_high_u8:
899 ; CHECK: // %bb.0: // %entry
900 ; CHECK-NEXT: usubw2 v0.8h, v0.8h, v1.16b
903 %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
904 %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
905 %sub.i = sub <8 x i16> %a, %0
; High-half wide unsigned subtract: lanes 4-7 of <8 x i16> %b are extracted, zero-extended to i32 and subtracted from %a; the checks expect a single usubw2.
909 define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) {
910 ; CHECK-LABEL: test_vsubw_high_u16:
911 ; CHECK: // %bb.0: // %entry
912 ; CHECK-NEXT: usubw2 v0.4s, v0.4s, v1.8h
915 %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
916 %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
917 %sub.i = sub <4 x i32> %a, %0
; High-half wide unsigned subtract: lanes 2-3 of <4 x i32> %b are extracted, zero-extended to i64 and subtracted from %a; the checks expect a single usubw2.
921 define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) {
922 ; CHECK-LABEL: test_vsubw_high_u32:
923 ; CHECK: // %bb.0: // %entry
924 ; CHECK-NEXT: usubw2 v0.2d, v0.2d, v1.4s
927 %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
928 %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
929 %sub.i = sub <2 x i64> %a, %0
; High-half wide unsigned subtract (lanes 8-15 of <16 x i8> %b zero-extended and subtracted from %a), then an AND with 255 per i16 lane; the checks expect usubw2 followed by a bic that clears the upper byte.
933 define <8 x i16> @test_vsubw_high_a8(<8 x i16> %a, <16 x i8> %b) {
934 ; CHECK-LABEL: test_vsubw_high_a8:
935 ; CHECK: // %bb.0: // %entry
936 ; CHECK-NEXT: usubw2 v0.8h, v0.8h, v1.16b
937 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
940 %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
941 %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
942 %sub.i = sub <8 x i16> %a, %0
943 %and = and <8 x i16> %sub.i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; Widening subtract of the zero-extended upper half of %b (lanes 4-7) from
; %a, followed by an `and` that keeps only the low 16 bits (mask 65535) of
; every 32-bit lane. The assertions expect the mask to be materialised in v2
; with `movi v2.2d, #0x00ffff0000ffff`, then `usubw2`, then a vector `and`.
947 define <4 x i32> @test_vsubw_high_a16(<4 x i32> %a, <8 x i16> %b) {
948 ; CHECK-LABEL: test_vsubw_high_a16:
949 ; CHECK: // %bb.0: // %entry
950 ; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
951 ; CHECK-NEXT: usubw2 v0.4s, v0.4s, v1.8h
952 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
955 %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
956 %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
957 %sub.i = sub <4 x i32> %a, %0
958 %and = and <4 x i32> %sub.i, <i32 65535, i32 65535, i32 65535, i32 65535>
; Widening subtract of the zero-extended upper half of %b (lanes 2-3) from
; %a, followed by an `and` that keeps only the low 32 bits (mask 4294967295)
; of every 64-bit lane. The assertions expect the mask to be materialised in
; v2 with `movi v2.2d, #0x000000ffffffff`, then `usubw2`, then a vector `and`.
962 define <2 x i64> @test_vsubw_high_a32(<2 x i64> %a, <4 x i32> %b) {
963 ; CHECK-LABEL: test_vsubw_high_a32:
964 ; CHECK: // %bb.0: // %entry
965 ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
966 ; CHECK-NEXT: usubw2 v0.2d, v0.2d, v1.4s
967 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
970 %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
971 %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
972 %sub.i = sub <2 x i64> %a, %0
973 %and = and <2 x i64> %sub.i, <i64 4294967295, i64 4294967295>
; Add-and-narrow written in plain IR: %a + %b, logical shift right by 8, then
; truncation to i8, so the upper 8 bits of each 16-bit sum are returned.
; The assertions expect this to select `addhn v0.8b, v0.8h, v1.8h`.
977 define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) {
978 ; CHECK-LABEL: test_vaddhn_s16:
979 ; CHECK: // %bb.0: // %entry
980 ; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h
983 %vaddhn.i = add <8 x i16> %a, %b
984 %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
985 %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
986 ret <8 x i8> %vaddhn2.i
; Add-and-narrow written in plain IR: %a + %b, logical shift right by 16, then
; truncation to i16, so the upper 16 bits of each 32-bit sum are returned.
; The assertions expect this to select `addhn v0.4h, v0.4s, v1.4s`.
989 define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) {
990 ; CHECK-LABEL: test_vaddhn_s32:
991 ; CHECK: // %bb.0: // %entry
992 ; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s
995 %vaddhn.i = add <4 x i32> %a, %b
996 %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
997 %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
998 ret <4 x i16> %vaddhn2.i
; Add-and-narrow written in plain IR: %a + %b, logical shift right by 32, then
; truncation to i32, so the upper 32 bits of each 64-bit sum are returned.
; The assertions expect this to select `addhn v0.2s, v0.2d, v1.2d`.
1001 define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) {
1002 ; CHECK-LABEL: test_vaddhn_s64:
1003 ; CHECK: // %bb.0: // %entry
1004 ; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d
1007 %vaddhn.i = add <2 x i64> %a, %b
1008 %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
1009 %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
1010 ret <2 x i32> %vaddhn2.i
; Add-and-narrow written in plain IR: %a + %b, logical shift right by 8, then
; truncation to i8, so the upper 8 bits of each 16-bit sum are returned.
; The IR body is the same as in test_vaddhn_s16; only the function name
; differs. The assertions expect `addhn v0.8b, v0.8h, v1.8h`.
1013 define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) {
1014 ; CHECK-LABEL: test_vaddhn_u16:
1015 ; CHECK: // %bb.0: // %entry
1016 ; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h
1019 %vaddhn.i = add <8 x i16> %a, %b
1020 %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1021 %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
1022 ret <8 x i8> %vaddhn2.i
; Add-and-narrow written in plain IR: %a + %b, logical shift right by 16, then
; truncation to i16, so the upper 16 bits of each 32-bit sum are returned.
; The IR body is the same as in test_vaddhn_s32; only the function name
; differs. The assertions expect `addhn v0.4h, v0.4s, v1.4s`.
1025 define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) {
1026 ; CHECK-LABEL: test_vaddhn_u32:
1027 ; CHECK: // %bb.0: // %entry
1028 ; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s
1031 %vaddhn.i = add <4 x i32> %a, %b
1032 %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
1033 %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
1034 ret <4 x i16> %vaddhn2.i
; Add-and-narrow written in plain IR: %a + %b, logical shift right by 32, then
; truncation to i32, so the upper 32 bits of each 64-bit sum are returned.
; The IR body is the same as in test_vaddhn_s64; only the function name
; differs. The assertions expect `addhn v0.2s, v0.2d, v1.2d`.
1037 define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) {
1038 ; CHECK-LABEL: test_vaddhn_u64:
1039 ; CHECK: // %bb.0: // %entry
1040 ; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d
1043 %vaddhn.i = add <2 x i64> %a, %b
1044 %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
1045 %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
1046 ret <2 x i32> %vaddhn2.i
; Add-and-narrow into the upper half of a 128-bit vector. %a + %b is shifted
; right by 8 and truncated to <8 x i8>; %r and that narrowed value are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the narrowed value element 1, then bitcast to <16 x i8>.
; The assertions expect `addhn2 v0.16b, v1.8h, v2.8h` (writing on top of the
; register that already holds %r), after the d0/q0 "kill" annotation.
1049 define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
1050 ; CHECK-LABEL: test_vaddhn_high_s16:
1051 ; CHECK: // %bb.0: // %entry
1052 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1053 ; CHECK-NEXT: addhn2 v0.16b, v1.8h, v2.8h
1056 %vaddhn.i.i = add <8 x i16> %a, %b
1057 %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1058 %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
1059 %0 = bitcast <8 x i8> %r to <1 x i64>
1060 %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
1061 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1062 %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
; Add-and-narrow into the upper half of a 128-bit vector. %a + %b is shifted
; right by 16 and truncated to <4 x i16>; %r and that narrowed value are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the narrowed value element 1, then bitcast to <8 x i16>.
; The assertions expect `addhn2 v0.8h, v1.4s, v2.4s`, after the d0/q0 "kill"
; annotation.
1066 define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
1067 ; CHECK-LABEL: test_vaddhn_high_s32:
1068 ; CHECK: // %bb.0: // %entry
1069 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1070 ; CHECK-NEXT: addhn2 v0.8h, v1.4s, v2.4s
1073 %vaddhn.i.i = add <4 x i32> %a, %b
1074 %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
1075 %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
1076 %0 = bitcast <4 x i16> %r to <1 x i64>
1077 %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
1078 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1079 %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
; Add-and-narrow into the upper half of a 128-bit vector. %a + %b is shifted
; right by 32 and truncated to <2 x i32>; %r and that narrowed value are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the narrowed value element 1, then bitcast to <4 x i32>.
; The assertions expect `addhn2 v0.4s, v1.2d, v2.2d`, after the d0/q0 "kill"
; annotation.
1083 define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
1084 ; CHECK-LABEL: test_vaddhn_high_s64:
1085 ; CHECK: // %bb.0: // %entry
1086 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1087 ; CHECK-NEXT: addhn2 v0.4s, v1.2d, v2.2d
1090 %vaddhn.i.i = add <2 x i64> %a, %b
1091 %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
1092 %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
1093 %0 = bitcast <2 x i32> %r to <1 x i64>
1094 %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
1095 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1096 %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
; Add-and-narrow into the upper half of a 128-bit vector. %a + %b is shifted
; right by 8 and truncated to <8 x i8>; %r and that narrowed value are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the narrowed value element 1, then bitcast to <16 x i8>.
; The IR body is the same as in test_vaddhn_high_s16. The assertions expect
; `addhn2 v0.16b, v1.8h, v2.8h`, after the d0/q0 "kill" annotation.
1100 define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
1101 ; CHECK-LABEL: test_vaddhn_high_u16:
1102 ; CHECK: // %bb.0: // %entry
1103 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1104 ; CHECK-NEXT: addhn2 v0.16b, v1.8h, v2.8h
1107 %vaddhn.i.i = add <8 x i16> %a, %b
1108 %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1109 %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
1110 %0 = bitcast <8 x i8> %r to <1 x i64>
1111 %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
1112 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1113 %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
; Add-and-narrow into the upper half of a 128-bit vector. %a + %b is shifted
; right by 16 and truncated to <4 x i16>; %r and that narrowed value are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the narrowed value element 1, then bitcast to <8 x i16>.
; The IR body is the same as in test_vaddhn_high_s32. The assertions expect
; `addhn2 v0.8h, v1.4s, v2.4s`, after the d0/q0 "kill" annotation.
1117 define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
1118 ; CHECK-LABEL: test_vaddhn_high_u32:
1119 ; CHECK: // %bb.0: // %entry
1120 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1121 ; CHECK-NEXT: addhn2 v0.8h, v1.4s, v2.4s
1124 %vaddhn.i.i = add <4 x i32> %a, %b
1125 %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
1126 %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
1127 %0 = bitcast <4 x i16> %r to <1 x i64>
1128 %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
1129 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1130 %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
; Add-and-narrow into the upper half of a 128-bit vector. %a + %b is shifted
; right by 32 and truncated to <2 x i32>; %r and that narrowed value are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the narrowed value element 1, then bitcast to <4 x i32>.
; The IR body is the same as in test_vaddhn_high_s64. The assertions expect
; `addhn2 v0.4s, v1.2d, v2.2d`, after the d0/q0 "kill" annotation.
1134 define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
1135 ; CHECK-LABEL: test_vaddhn_high_u64:
1136 ; CHECK: // %bb.0: // %entry
1137 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1138 ; CHECK-NEXT: addhn2 v0.4s, v1.2d, v2.2d
1141 %vaddhn.i.i = add <2 x i64> %a, %b
1142 %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
1143 %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
1144 %0 = bitcast <2 x i32> %r to <1 x i64>
1145 %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
1146 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1147 %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
; Passes %a and %b to the @llvm.aarch64.neon.raddhn.v8i8 intrinsic and returns
; its <8 x i8> result unchanged.
; The assertions expect a single `raddhn v0.8b, v0.8h, v1.8h`.
1151 define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) {
1152 ; CHECK-LABEL: test_vraddhn_s16:
1153 ; CHECK: // %bb.0: // %entry
1154 ; CHECK-NEXT: raddhn v0.8b, v0.8h, v1.8h
1157 %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
1158 ret <8 x i8> %vraddhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.raddhn.v4i16 intrinsic and
; returns its <4 x i16> result unchanged.
; The assertions expect a single `raddhn v0.4h, v0.4s, v1.4s`.
1161 define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) {
1162 ; CHECK-LABEL: test_vraddhn_s32:
1163 ; CHECK: // %bb.0: // %entry
1164 ; CHECK-NEXT: raddhn v0.4h, v0.4s, v1.4s
1167 %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
1168 ret <4 x i16> %vraddhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.raddhn.v2i32 intrinsic and
; returns its <2 x i32> result unchanged.
; The assertions expect a single `raddhn v0.2s, v0.2d, v1.2d`.
1171 define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) {
1172 ; CHECK-LABEL: test_vraddhn_s64:
1173 ; CHECK: // %bb.0: // %entry
1174 ; CHECK-NEXT: raddhn v0.2s, v0.2d, v1.2d
1177 %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
1178 ret <2 x i32> %vraddhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.raddhn.v8i8 intrinsic and returns
; its <8 x i8> result unchanged. The IR body is the same as in
; test_vraddhn_s16; only the function name differs.
; The assertions expect a single `raddhn v0.8b, v0.8h, v1.8h`.
1181 define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) {
1182 ; CHECK-LABEL: test_vraddhn_u16:
1183 ; CHECK: // %bb.0: // %entry
1184 ; CHECK-NEXT: raddhn v0.8b, v0.8h, v1.8h
1187 %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
1188 ret <8 x i8> %vraddhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.raddhn.v4i16 intrinsic and
; returns its <4 x i16> result unchanged. The IR body is the same as in
; test_vraddhn_s32; only the function name differs.
; The assertions expect a single `raddhn v0.4h, v0.4s, v1.4s`.
1191 define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) {
1192 ; CHECK-LABEL: test_vraddhn_u32:
1193 ; CHECK: // %bb.0: // %entry
1194 ; CHECK-NEXT: raddhn v0.4h, v0.4s, v1.4s
1197 %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
1198 ret <4 x i16> %vraddhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.raddhn.v2i32 intrinsic and
; returns its <2 x i32> result unchanged. The IR body is the same as in
; test_vraddhn_s64; only the function name differs.
; The assertions expect a single `raddhn v0.2s, v0.2d, v1.2d`.
1201 define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) {
1202 ; CHECK-LABEL: test_vraddhn_u64:
1203 ; CHECK: // %bb.0: // %entry
1204 ; CHECK-NEXT: raddhn v0.2s, v0.2d, v1.2d
1207 %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
1208 ret <2 x i32> %vraddhn2.i
; raddhn intrinsic result placed in the upper half of a 128-bit vector: the
; <8 x i8> value from @llvm.aarch64.neon.raddhn.v8i8(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <16 x i8>.
; The assertions expect `raddhn2 v0.16b, v1.8h, v2.8h`, after the d0/q0
; "kill" annotation.
1211 define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
1212 ; CHECK-LABEL: test_vraddhn_high_s16:
1213 ; CHECK: // %bb.0: // %entry
1214 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1215 ; CHECK-NEXT: raddhn2 v0.16b, v1.8h, v2.8h
1218 %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
1219 %0 = bitcast <8 x i8> %r to <1 x i64>
1220 %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
1221 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1222 %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
; raddhn intrinsic result placed in the upper half of a 128-bit vector: the
; <4 x i16> value from @llvm.aarch64.neon.raddhn.v4i16(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <8 x i16>.
; The assertions expect `raddhn2 v0.8h, v1.4s, v2.4s`, after the d0/q0 "kill"
; annotation.
1226 define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
1227 ; CHECK-LABEL: test_vraddhn_high_s32:
1228 ; CHECK: // %bb.0: // %entry
1229 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1230 ; CHECK-NEXT: raddhn2 v0.8h, v1.4s, v2.4s
1233 %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
1234 %0 = bitcast <4 x i16> %r to <1 x i64>
1235 %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
1236 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1237 %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
; raddhn intrinsic result placed in the upper half of a 128-bit vector: the
; <2 x i32> value from @llvm.aarch64.neon.raddhn.v2i32(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <4 x i32>.
; The assertions expect `raddhn2 v0.4s, v1.2d, v2.2d`, after the d0/q0 "kill"
; annotation.
1241 define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
1242 ; CHECK-LABEL: test_vraddhn_high_s64:
1243 ; CHECK: // %bb.0: // %entry
1244 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1245 ; CHECK-NEXT: raddhn2 v0.4s, v1.2d, v2.2d
1248 %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
1249 %0 = bitcast <2 x i32> %r to <1 x i64>
1250 %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
1251 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1252 %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
; raddhn intrinsic result placed in the upper half of a 128-bit vector: the
; <8 x i8> value from @llvm.aarch64.neon.raddhn.v8i8(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <16 x i8>.
; The IR body is the same as in test_vraddhn_high_s16. The assertions expect
; `raddhn2 v0.16b, v1.8h, v2.8h`, after the d0/q0 "kill" annotation.
1256 define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
1257 ; CHECK-LABEL: test_vraddhn_high_u16:
1258 ; CHECK: // %bb.0: // %entry
1259 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1260 ; CHECK-NEXT: raddhn2 v0.16b, v1.8h, v2.8h
1263 %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
1264 %0 = bitcast <8 x i8> %r to <1 x i64>
1265 %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
1266 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1267 %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
; raddhn intrinsic result placed in the upper half of a 128-bit vector: the
; <4 x i16> value from @llvm.aarch64.neon.raddhn.v4i16(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <8 x i16>.
; The IR body is the same as in test_vraddhn_high_s32. The assertions expect
; `raddhn2 v0.8h, v1.4s, v2.4s`, after the d0/q0 "kill" annotation.
1271 define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
1272 ; CHECK-LABEL: test_vraddhn_high_u32:
1273 ; CHECK: // %bb.0: // %entry
1274 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1275 ; CHECK-NEXT: raddhn2 v0.8h, v1.4s, v2.4s
1278 %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
1279 %0 = bitcast <4 x i16> %r to <1 x i64>
1280 %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
1281 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1282 %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
; raddhn intrinsic result placed in the upper half of a 128-bit vector: the
; <2 x i32> value from @llvm.aarch64.neon.raddhn.v2i32(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <4 x i32>.
; The IR body is the same as in test_vraddhn_high_s64. The assertions expect
; `raddhn2 v0.4s, v1.2d, v2.2d`, after the d0/q0 "kill" annotation.
1286 define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
1287 ; CHECK-LABEL: test_vraddhn_high_u64:
1288 ; CHECK: // %bb.0: // %entry
1289 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1290 ; CHECK-NEXT: raddhn2 v0.4s, v1.2d, v2.2d
1293 %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
1294 %0 = bitcast <2 x i32> %r to <1 x i64>
1295 %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
1296 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1297 %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
; Subtract-and-narrow written in plain IR: %a - %b, logical shift right by 8,
; then truncation to i8, so the upper 8 bits of each 16-bit difference are
; returned. The assertions expect `subhn v0.8b, v0.8h, v1.8h`.
1301 define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
1302 ; CHECK-LABEL: test_vsubhn_s16:
1303 ; CHECK: // %bb.0: // %entry
1304 ; CHECK-NEXT: subhn v0.8b, v0.8h, v1.8h
1307 %vsubhn.i = sub <8 x i16> %a, %b
1308 %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1309 %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
1310 ret <8 x i8> %vsubhn2.i
; Subtract-and-narrow written in plain IR: %a - %b, logical shift right by 16,
; then truncation to i16, so the upper 16 bits of each 32-bit difference are
; returned. The assertions expect `subhn v0.4h, v0.4s, v1.4s`.
1313 define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
1314 ; CHECK-LABEL: test_vsubhn_s32:
1315 ; CHECK: // %bb.0: // %entry
1316 ; CHECK-NEXT: subhn v0.4h, v0.4s, v1.4s
1319 %vsubhn.i = sub <4 x i32> %a, %b
1320 %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
1321 %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
1322 ret <4 x i16> %vsubhn2.i
; Subtract-and-narrow written in plain IR: %a - %b, logical shift right by 32,
; then truncation to i32, so the upper 32 bits of each 64-bit difference are
; returned. The assertions expect `subhn v0.2s, v0.2d, v1.2d`.
1325 define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
1326 ; CHECK-LABEL: test_vsubhn_s64:
1327 ; CHECK: // %bb.0: // %entry
1328 ; CHECK-NEXT: subhn v0.2s, v0.2d, v1.2d
1331 %vsubhn.i = sub <2 x i64> %a, %b
1332 %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
1333 %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
1334 ret <2 x i32> %vsubhn2.i
; Subtract-and-narrow written in plain IR: %a - %b, logical shift right by 8,
; then truncation to i8, so the upper 8 bits of each 16-bit difference are
; returned. The IR body is the same as in test_vsubhn_s16; only the function
; name differs. The assertions expect `subhn v0.8b, v0.8h, v1.8h`.
1337 define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
1338 ; CHECK-LABEL: test_vsubhn_u16:
1339 ; CHECK: // %bb.0: // %entry
1340 ; CHECK-NEXT: subhn v0.8b, v0.8h, v1.8h
1343 %vsubhn.i = sub <8 x i16> %a, %b
1344 %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1345 %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
1346 ret <8 x i8> %vsubhn2.i
; Subtract-and-narrow written in plain IR: %a - %b, logical shift right by 16,
; then truncation to i16, so the upper 16 bits of each 32-bit difference are
; returned. The IR body is the same as in test_vsubhn_s32; only the function
; name differs. The assertions expect `subhn v0.4h, v0.4s, v1.4s`.
1349 define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
1350 ; CHECK-LABEL: test_vsubhn_u32:
1351 ; CHECK: // %bb.0: // %entry
1352 ; CHECK-NEXT: subhn v0.4h, v0.4s, v1.4s
1355 %vsubhn.i = sub <4 x i32> %a, %b
1356 %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
1357 %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
1358 ret <4 x i16> %vsubhn2.i
; Subtract-and-narrow written in plain IR: %a - %b, logical shift right by 32,
; then truncation to i32, so the upper 32 bits of each 64-bit difference are
; returned. The IR body is the same as in test_vsubhn_s64; only the function
; name differs. The assertions expect `subhn v0.2s, v0.2d, v1.2d`.
1361 define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
1362 ; CHECK-LABEL: test_vsubhn_u64:
1363 ; CHECK: // %bb.0: // %entry
1364 ; CHECK-NEXT: subhn v0.2s, v0.2d, v1.2d
1367 %vsubhn.i = sub <2 x i64> %a, %b
1368 %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
1369 %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
1370 ret <2 x i32> %vsubhn2.i
; Subtract-and-narrow into the upper half of a 128-bit vector. %a - %b is
; shifted right by 8 and truncated to <8 x i8>; %r and that narrowed value are
; each bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the narrowed value element 1, then bitcast to <16 x i8>.
; The assertions expect `subhn2 v0.16b, v1.8h, v2.8h`, after the d0/q0 "kill"
; annotation.
1373 define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
1374 ; CHECK-LABEL: test_vsubhn_high_s16:
1375 ; CHECK: // %bb.0: // %entry
1376 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1377 ; CHECK-NEXT: subhn2 v0.16b, v1.8h, v2.8h
1380 %vsubhn.i.i = sub <8 x i16> %a, %b
1381 %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1382 %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
1383 %0 = bitcast <8 x i8> %r to <1 x i64>
1384 %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
1385 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1386 %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
; Subtract-and-narrow into the upper half of a 128-bit vector. %a - %b is
; shifted right by 16 and truncated to <4 x i16>; %r and that narrowed value
; are each bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r
; becomes element 0 and the narrowed value element 1, then bitcast to
; <8 x i16>. The assertions expect `subhn2 v0.8h, v1.4s, v2.4s`, after the
; d0/q0 "kill" annotation.
1390 define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
1391 ; CHECK-LABEL: test_vsubhn_high_s32:
1392 ; CHECK: // %bb.0: // %entry
1393 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1394 ; CHECK-NEXT: subhn2 v0.8h, v1.4s, v2.4s
1397 %vsubhn.i.i = sub <4 x i32> %a, %b
1398 %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
1399 %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
1400 %0 = bitcast <4 x i16> %r to <1 x i64>
1401 %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
1402 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1403 %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
; Subtract-and-narrow into the upper half of a 128-bit vector. %a - %b is
; shifted right by 32 and truncated to <2 x i32>; %r and that narrowed value
; are each bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r
; becomes element 0 and the narrowed value element 1, then bitcast to
; <4 x i32>. The assertions expect `subhn2 v0.4s, v1.2d, v2.2d`, after the
; d0/q0 "kill" annotation.
1407 define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
1408 ; CHECK-LABEL: test_vsubhn_high_s64:
1409 ; CHECK: // %bb.0: // %entry
1410 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1411 ; CHECK-NEXT: subhn2 v0.4s, v1.2d, v2.2d
1414 %vsubhn.i.i = sub <2 x i64> %a, %b
1415 %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
1416 %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
1417 %0 = bitcast <2 x i32> %r to <1 x i64>
1418 %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
1419 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1420 %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
; Subtract-and-narrow into the upper half of a 128-bit vector. %a - %b is
; shifted right by 8 and truncated to <8 x i8>; %r and that narrowed value are
; each bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the narrowed value element 1, then bitcast to <16 x i8>.
; The IR body is the same as in test_vsubhn_high_s16. The assertions expect
; `subhn2 v0.16b, v1.8h, v2.8h`, after the d0/q0 "kill" annotation.
1424 define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
1425 ; CHECK-LABEL: test_vsubhn_high_u16:
1426 ; CHECK: // %bb.0: // %entry
1427 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1428 ; CHECK-NEXT: subhn2 v0.16b, v1.8h, v2.8h
1431 %vsubhn.i.i = sub <8 x i16> %a, %b
1432 %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1433 %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
1434 %0 = bitcast <8 x i8> %r to <1 x i64>
1435 %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
1436 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1437 %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
; Subtract-and-narrow into the upper half of a 128-bit vector. %a - %b is
; shifted right by 16 and truncated to <4 x i16>; %r and that narrowed value
; are each bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r
; becomes element 0 and the narrowed value element 1, then bitcast to
; <8 x i16>. The IR body is the same as in test_vsubhn_high_s32. The
; assertions expect `subhn2 v0.8h, v1.4s, v2.4s`, after the d0/q0 "kill"
; annotation.
1441 define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
1442 ; CHECK-LABEL: test_vsubhn_high_u32:
1443 ; CHECK: // %bb.0: // %entry
1444 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1445 ; CHECK-NEXT: subhn2 v0.8h, v1.4s, v2.4s
1448 %vsubhn.i.i = sub <4 x i32> %a, %b
1449 %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
1450 %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
1451 %0 = bitcast <4 x i16> %r to <1 x i64>
1452 %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
1453 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1454 %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
; Subtract-and-narrow into the upper half of a 128-bit vector. %a - %b is
; shifted right by 32 and truncated to <2 x i32>; %r and that narrowed value
; are each bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r
; becomes element 0 and the narrowed value element 1, then bitcast to
; <4 x i32>. The IR body is the same as in test_vsubhn_high_s64. The
; assertions expect `subhn2 v0.4s, v1.2d, v2.2d`, after the d0/q0 "kill"
; annotation.
1458 define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
1459 ; CHECK-LABEL: test_vsubhn_high_u64:
1460 ; CHECK: // %bb.0: // %entry
1461 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1462 ; CHECK-NEXT: subhn2 v0.4s, v1.2d, v2.2d
1465 %vsubhn.i.i = sub <2 x i64> %a, %b
1466 %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
1467 %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
1468 %0 = bitcast <2 x i32> %r to <1 x i64>
1469 %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
1470 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1471 %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
; Passes %a and %b to the @llvm.aarch64.neon.rsubhn.v8i8 intrinsic and returns
; its <8 x i8> result unchanged.
; The assertions expect a single `rsubhn v0.8b, v0.8h, v1.8h`.
1475 define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
1476 ; CHECK-LABEL: test_vrsubhn_s16:
1477 ; CHECK: // %bb.0: // %entry
1478 ; CHECK-NEXT: rsubhn v0.8b, v0.8h, v1.8h
1481 %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
1482 ret <8 x i8> %vrsubhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.rsubhn.v4i16 intrinsic and
; returns its <4 x i16> result unchanged.
; The assertions expect a single `rsubhn v0.4h, v0.4s, v1.4s`.
1485 define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
1486 ; CHECK-LABEL: test_vrsubhn_s32:
1487 ; CHECK: // %bb.0: // %entry
1488 ; CHECK-NEXT: rsubhn v0.4h, v0.4s, v1.4s
1491 %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
1492 ret <4 x i16> %vrsubhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.rsubhn.v2i32 intrinsic and
; returns its <2 x i32> result unchanged.
; The assertions expect a single `rsubhn v0.2s, v0.2d, v1.2d`.
1495 define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
1496 ; CHECK-LABEL: test_vrsubhn_s64:
1497 ; CHECK: // %bb.0: // %entry
1498 ; CHECK-NEXT: rsubhn v0.2s, v0.2d, v1.2d
1501 %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
1502 ret <2 x i32> %vrsubhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.rsubhn.v8i8 intrinsic and returns
; its <8 x i8> result unchanged. The IR body is the same as in
; test_vrsubhn_s16; only the function name differs.
; The assertions expect a single `rsubhn v0.8b, v0.8h, v1.8h`.
1505 define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
1506 ; CHECK-LABEL: test_vrsubhn_u16:
1507 ; CHECK: // %bb.0: // %entry
1508 ; CHECK-NEXT: rsubhn v0.8b, v0.8h, v1.8h
1511 %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
1512 ret <8 x i8> %vrsubhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.rsubhn.v4i16 intrinsic and
; returns its <4 x i16> result unchanged. The IR body is the same as in
; test_vrsubhn_s32; only the function name differs.
; The assertions expect a single `rsubhn v0.4h, v0.4s, v1.4s`.
1515 define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
1516 ; CHECK-LABEL: test_vrsubhn_u32:
1517 ; CHECK: // %bb.0: // %entry
1518 ; CHECK-NEXT: rsubhn v0.4h, v0.4s, v1.4s
1521 %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
1522 ret <4 x i16> %vrsubhn2.i
; Passes %a and %b to the @llvm.aarch64.neon.rsubhn.v2i32 intrinsic and
; returns its <2 x i32> result unchanged. The IR body is the same as in
; test_vrsubhn_s64; only the function name differs.
; The assertions expect a single `rsubhn v0.2s, v0.2d, v1.2d`.
1525 define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
1526 ; CHECK-LABEL: test_vrsubhn_u64:
1527 ; CHECK: // %bb.0: // %entry
1528 ; CHECK-NEXT: rsubhn v0.2s, v0.2d, v1.2d
1531 %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
1532 ret <2 x i32> %vrsubhn2.i
; rsubhn intrinsic result placed in the upper half of a 128-bit vector: the
; <8 x i8> value from @llvm.aarch64.neon.rsubhn.v8i8(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <16 x i8>.
; The assertions expect `rsubhn2 v0.16b, v1.8h, v2.8h`, after the d0/q0
; "kill" annotation.
1535 define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
1536 ; CHECK-LABEL: test_vrsubhn_high_s16:
1537 ; CHECK: // %bb.0: // %entry
1538 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1539 ; CHECK-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h
1542 %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
1543 %0 = bitcast <8 x i8> %r to <1 x i64>
1544 %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
1545 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1546 %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
; rsubhn intrinsic result placed in the upper half of a 128-bit vector: the
; <4 x i16> value from @llvm.aarch64.neon.rsubhn.v4i16(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <8 x i16>.
; The assertions expect `rsubhn2 v0.8h, v1.4s, v2.4s`, after the d0/q0 "kill"
; annotation.
1550 define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
1551 ; CHECK-LABEL: test_vrsubhn_high_s32:
1552 ; CHECK: // %bb.0: // %entry
1553 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1554 ; CHECK-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s
1557 %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
1558 %0 = bitcast <4 x i16> %r to <1 x i64>
1559 %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
1560 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1561 %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
; rsubhn intrinsic result placed in the upper half of a 128-bit vector: the
; <2 x i32> value from @llvm.aarch64.neon.rsubhn.v2i32(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <4 x i32>.
; The assertions expect `rsubhn2 v0.4s, v1.2d, v2.2d`, after the d0/q0 "kill"
; annotation.
1565 define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
1566 ; CHECK-LABEL: test_vrsubhn_high_s64:
1567 ; CHECK: // %bb.0: // %entry
1568 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1569 ; CHECK-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d
1572 %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
1573 %0 = bitcast <2 x i32> %r to <1 x i64>
1574 %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
1575 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1576 %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
; rsubhn intrinsic result placed in the upper half of a 128-bit vector: the
; <8 x i8> value from @llvm.aarch64.neon.rsubhn.v8i8(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <16 x i8>.
; The IR body is the same as in test_vrsubhn_high_s16. The assertions expect
; `rsubhn2 v0.16b, v1.8h, v2.8h`, after the d0/q0 "kill" annotation.
1580 define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
1581 ; CHECK-LABEL: test_vrsubhn_high_u16:
1582 ; CHECK: // %bb.0: // %entry
1583 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1584 ; CHECK-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h
1587 %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
1588 %0 = bitcast <8 x i8> %r to <1 x i64>
1589 %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
1590 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1591 %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
; rsubhn intrinsic result placed in the upper half of a 128-bit vector: the
; <4 x i16> value from @llvm.aarch64.neon.rsubhn.v4i16(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <8 x i16>.
; The IR body is the same as in test_vrsubhn_high_s32. The assertions expect
; `rsubhn2 v0.8h, v1.4s, v2.4s`, after the d0/q0 "kill" annotation.
1595 define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
1596 ; CHECK-LABEL: test_vrsubhn_high_u32:
1597 ; CHECK: // %bb.0: // %entry
1598 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1599 ; CHECK-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s
1602 %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
1603 %0 = bitcast <4 x i16> %r to <1 x i64>
1604 %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
1605 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1606 %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
; rsubhn intrinsic result placed in the upper half of a 128-bit vector: the
; <2 x i32> value from @llvm.aarch64.neon.rsubhn.v2i32(%a, %b) and %r are each
; bitcast to <1 x i64> and joined with shuffle mask <0, 1>, so %r becomes
; element 0 and the intrinsic result element 1, then bitcast to <4 x i32>.
; The IR body is the same as in test_vrsubhn_high_s64. The assertions expect
; `rsubhn2 v0.4s, v1.2d, v2.2d`, after the d0/q0 "kill" annotation.
1610 define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
1611 ; CHECK-LABEL: test_vrsubhn_high_u64:
1612 ; CHECK: // %bb.0: // %entry
1613 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1614 ; CHECK-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d
1617 %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
1618 %0 = bitcast <2 x i32> %r to <1 x i64>
1619 %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
1620 %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
1621 %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
; Calls @llvm.aarch64.neon.sabd.v8i8 on %a and %b, zero-extends the <8 x i8>
; result to <8 x i16> and returns it.
; The assertions expect the pair to fold into `sabdl v0.8h, v0.8b, v1.8b`.
1625 define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) {
1626 ; CHECK-LABEL: test_vabdl_s8:
1627 ; CHECK: // %bb.0: // %entry
1628 ; CHECK-NEXT: sabdl v0.8h, v0.8b, v1.8b
1631 %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
1632 %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
1633 ret <8 x i16> %vmovl.i.i
; Calls @llvm.aarch64.neon.sabd.v4i16 on %a and %b, zero-extends the
; <4 x i16> result to <4 x i32> and returns it.
; The assertions expect the pair to fold into `sabdl v0.4s, v0.4h, v1.4h`.
1636 define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) {
1637 ; CHECK-LABEL: test_vabdl_s16:
1638 ; CHECK: // %bb.0: // %entry
1639 ; CHECK-NEXT: sabdl v0.4s, v0.4h, v1.4h
1642 %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
1643 %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
1644 ret <4 x i32> %vmovl.i.i
; Calls @llvm.aarch64.neon.sabd.v2i32 on %a and %b, zero-extends the
; <2 x i32> result to <2 x i64> and returns it.
; The assertions expect the pair to fold into `sabdl v0.2d, v0.2s, v1.2s`.
1647 define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) {
1648 ; CHECK-LABEL: test_vabdl_s32:
1649 ; CHECK: // %bb.0: // %entry
1650 ; CHECK-NEXT: sabdl v0.2d, v0.2s, v1.2s
1653 %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
1654 %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
1655 ret <2 x i64> %vmovl.i.i
; Calls @llvm.aarch64.neon.uabd.v8i8 on %a and %b, zero-extends the <8 x i8>
; result to <8 x i16> and returns it.
; The assertions expect the pair to fold into `uabdl v0.8h, v0.8b, v1.8b`.
1658 define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) {
1659 ; CHECK-LABEL: test_vabdl_u8:
1660 ; CHECK: // %bb.0: // %entry
1661 ; CHECK-NEXT: uabdl v0.8h, v0.8b, v1.8b
1664 %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
1665 %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
1666 ret <8 x i16> %vmovl.i.i
; Calls @llvm.aarch64.neon.uabd.v4i16 on %a and %b, zero-extends the
; <4 x i16> result to <4 x i32> and returns it.
; The assertions expect the pair to fold into `uabdl v0.4s, v0.4h, v1.4h`.
1669 define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) {
1670 ; CHECK-LABEL: test_vabdl_u16:
1671 ; CHECK: // %bb.0: // %entry
1672 ; CHECK-NEXT: uabdl v0.4s, v0.4h, v1.4h
1675 %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
1676 %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
1677 ret <4 x i32> %vmovl.i.i
; Calls @llvm.aarch64.neon.uabd.v2i32 on %a and %b, zero-extends the
; <2 x i32> result to <2 x i64> and returns it.
; The assertions expect the pair to fold into `uabdl v0.2d, v0.2s, v1.2s`.
1680 define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) {
1681 ; CHECK-LABEL: test_vabdl_u32:
1682 ; CHECK: // %bb.0: // %entry
1683 ; CHECK-NEXT: uabdl v0.2d, v0.2s, v1.2s
1686 %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
1687 %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
1688 ret <2 x i64> %vmovl.i.i
; Calls @llvm.aarch64.neon.sabd.v8i8 on %b and %c, zero-extends the <8 x i8>
; result to <8 x i16>, adds the accumulator %a and returns the sum.
; The assertions expect the whole sequence to fold into
; `sabal v0.8h, v1.8b, v2.8b`.
1691 define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
1692 ; CHECK-LABEL: test_vabal_s8:
1693 ; CHECK: // %bb.0: // %entry
1694 ; CHECK-NEXT: sabal v0.8h, v1.8b, v2.8b
1697 %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
1698 %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
1699 %add.i = add <8 x i16> %vmovl.i.i.i, %a
1700 ret <8 x i16> %add.i
; Calls @llvm.aarch64.neon.sabd.v4i16 on %b and %c, zero-extends the
; <4 x i16> result to <4 x i32>, adds the accumulator %a and returns the sum.
; The assertions expect the whole sequence to fold into
; `sabal v0.4s, v1.4h, v2.4h`.
1703 define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
1704 ; CHECK-LABEL: test_vabal_s16:
1705 ; CHECK: // %bb.0: // %entry
1706 ; CHECK-NEXT: sabal v0.4s, v1.4h, v2.4h
1709 %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
1710 %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
1711 %add.i = add <4 x i32> %vmovl.i.i.i, %a
1712 ret <4 x i32> %add.i
; Calls @llvm.aarch64.neon.sabd.v2i32 on %b and %c, zero-extends the
; <2 x i32> result to <2 x i64>, adds the accumulator %a and returns the sum.
; The assertions expect the whole sequence to fold into
; `sabal v0.2d, v1.2s, v2.2s`.
1715 define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
1716 ; CHECK-LABEL: test_vabal_s32:
1717 ; CHECK: // %bb.0: // %entry
1718 ; CHECK-NEXT: sabal v0.2d, v1.2s, v2.2s
1721 %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
1722 %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
1723 %add.i = add <2 x i64> %vmovl.i.i.i, %a
1724 ret <2 x i64> %add.i
; Calls @llvm.aarch64.neon.uabd.v8i8 on %b and %c, zero-extends the <8 x i8>
; result to <8 x i16>, adds the accumulator %a and returns the sum.
; The assertions expect the whole sequence to fold into
; `uabal v0.8h, v1.8b, v2.8b`.
1727 define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
1728 ; CHECK-LABEL: test_vabal_u8:
1729 ; CHECK: // %bb.0: // %entry
1730 ; CHECK-NEXT: uabal v0.8h, v1.8b, v2.8b
1733 %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
1734 %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
1735 %add.i = add <8 x i16> %vmovl.i.i.i, %a
1736 ret <8 x i16> %add.i
; Calls @llvm.aarch64.neon.uabd.v4i16 on %b and %c, zero-extends the
; <4 x i16> result to <4 x i32>, adds the accumulator %a and returns the sum.
; The assertions expect the whole sequence to fold into
; `uabal v0.4s, v1.4h, v2.4h`.
1739 define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
1740 ; CHECK-LABEL: test_vabal_u16:
1741 ; CHECK: // %bb.0: // %entry
1742 ; CHECK-NEXT: uabal v0.4s, v1.4h, v2.4h
1745 %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
1746 %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
1747 %add.i = add <4 x i32> %vmovl.i.i.i, %a
1748 ret <4 x i32> %add.i
; Calls @llvm.aarch64.neon.uabd.v2i32 on %b and %c, zero-extends the
; <2 x i32> result to <2 x i64>, adds the accumulator %a and returns the sum.
; The assertions expect the whole sequence to fold into
; `uabal v0.2d, v1.2s, v2.2s`.
1751 define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
1752 ; CHECK-LABEL: test_vabal_u32:
1753 ; CHECK: // %bb.0: // %entry
1754 ; CHECK-NEXT: uabal v0.2d, v1.2s, v2.2s
1757 %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
1758 %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
1759 %add.i = add <2 x i64> %vmovl.i.i.i, %a
1760 ret <2 x i64> %add.i
; Upper-half variant: lanes 8-15 of both %a and %b are extracted with
; shufflevector, passed to @llvm.aarch64.neon.sabd.v8i8, and the <8 x i8>
; result is zero-extended to <8 x i16> and returned.
; The assertions expect the shuffles to be absorbed into a single
; `sabdl2 v0.8h, v0.16b, v1.16b`.
1763 define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) {
1764 ; CHECK-LABEL: test_vabdl_high_s8:
1765 ; CHECK: // %bb.0: // %entry
1766 ; CHECK-NEXT: sabdl2 v0.8h, v0.16b, v1.16b
1769 %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1770 %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1771 %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
1772 %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
1773 ret <8 x i16> %vmovl.i.i.i
; Upper-half variant: lanes 4-7 of both %a and %b are extracted with
; shufflevector, passed to @llvm.aarch64.neon.sabd.v4i16, and the <4 x i16>
; result is zero-extended to <4 x i32> and returned.
; The assertions expect the shuffles to be absorbed into a single
; `sabdl2 v0.4s, v0.8h, v1.8h`.
1776 define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) {
1777 ; CHECK-LABEL: test_vabdl_high_s16:
1778 ; CHECK: // %bb.0: // %entry
1779 ; CHECK-NEXT: sabdl2 v0.4s, v0.8h, v1.8h
1782 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1783 %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1784 %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
1785 %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
1786 ret <4 x i32> %vmovl.i.i.i
; Upper-half variant: lanes 2-3 of both %a and %b are extracted with
; shufflevector, passed to @llvm.aarch64.neon.sabd.v2i32, and the <2 x i32>
; result is zero-extended to <2 x i64> and returned.
; The assertions expect the shuffles to be absorbed into a single
; `sabdl2 v0.2d, v0.4s, v1.4s`.
1789 define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) {
1790 ; CHECK-LABEL: test_vabdl_high_s32:
1791 ; CHECK: // %bb.0: // %entry
1792 ; CHECK-NEXT: sabdl2 v0.2d, v0.4s, v1.4s
1795 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1796 %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1797 %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
1798 %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
1799 ret <2 x i64> %vmovl.i.i.i
; Extracts lanes 8-15 of %a and %b, passes them to llvm.aarch64.neon.uabd.v8i8
; and zero-extends the result to <8 x i16>.
; The expected assembly below is the widening high-half form, uabdl2.
1802 define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) {
1803 ; CHECK-LABEL: test_vabdl_high_u8:
1804 ; CHECK: // %bb.0: // %entry
1805 ; CHECK-NEXT: uabdl2 v0.8h, v0.16b, v1.16b
1808 %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1809 %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1810 %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
1811 %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
1812 ret <8 x i16> %vmovl.i.i.i
; Extracts lanes 4-7 of %a and %b, passes them to llvm.aarch64.neon.uabd.v4i16
; and zero-extends the result to <4 x i32>.
; The expected assembly below is the widening high-half form, uabdl2.
1815 define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) {
1816 ; CHECK-LABEL: test_vabdl_high_u16:
1817 ; CHECK: // %bb.0: // %entry
1818 ; CHECK-NEXT: uabdl2 v0.4s, v0.8h, v1.8h
1821 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1822 %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1823 %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
1824 %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
1825 ret <4 x i32> %vmovl.i.i.i
; Extracts lanes 2-3 of %a and %b, passes them to llvm.aarch64.neon.uabd.v2i32
; and zero-extends the result to <2 x i64>.
; The expected assembly below is the widening high-half form, uabdl2.
1828 define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) {
1829 ; CHECK-LABEL: test_vabdl_high_u32:
1830 ; CHECK: // %bb.0: // %entry
1831 ; CHECK-NEXT: uabdl2 v0.2d, v0.4s, v1.4s
1834 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1835 %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1836 %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
1837 %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
1838 ret <2 x i64> %vmovl.i.i.i
; Extracts lanes 8-15 of %b and %c, passes them to llvm.aarch64.neon.sabd.v8i8,
; zero-extends the result to <8 x i16> and adds the accumulator %a.
; The expected assembly below folds all of this into sabal2.
1841 define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
1842 ; CHECK-LABEL: test_vabal_high_s8:
1843 ; CHECK: // %bb.0: // %entry
1844 ; CHECK-NEXT: sabal2 v0.8h, v1.16b, v2.16b
1847 %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1848 %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1849 %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
1850 %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
1851 %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
1852 ret <8 x i16> %add.i.i
; Extracts lanes 4-7 of %b and %c, passes them to llvm.aarch64.neon.sabd.v4i16,
; zero-extends the result to <4 x i32> and adds the accumulator %a.
; The expected assembly below folds all of this into sabal2.
1855 define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
1856 ; CHECK-LABEL: test_vabal_high_s16:
1857 ; CHECK: // %bb.0: // %entry
1858 ; CHECK-NEXT: sabal2 v0.4s, v1.8h, v2.8h
1861 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1862 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1863 %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
1864 %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
1865 %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
1866 ret <4 x i32> %add.i.i
; Extracts lanes 2-3 of %b and %c, passes them to llvm.aarch64.neon.sabd.v2i32,
; zero-extends the result to <2 x i64> and adds the accumulator %a.
; The expected assembly below folds all of this into sabal2.
1869 define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
1870 ; CHECK-LABEL: test_vabal_high_s32:
1871 ; CHECK: // %bb.0: // %entry
1872 ; CHECK-NEXT: sabal2 v0.2d, v1.4s, v2.4s
1875 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1876 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1877 %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
1878 %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
1879 %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
1880 ret <2 x i64> %add.i.i
; Extracts lanes 8-15 of %b and %c, passes them to llvm.aarch64.neon.uabd.v8i8,
; zero-extends the result to <8 x i16> and adds the accumulator %a.
; The expected assembly below folds all of this into uabal2.
1883 define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
1884 ; CHECK-LABEL: test_vabal_high_u8:
1885 ; CHECK: // %bb.0: // %entry
1886 ; CHECK-NEXT: uabal2 v0.8h, v1.16b, v2.16b
1889 %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1890 %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1891 %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
1892 %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
1893 %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
1894 ret <8 x i16> %add.i.i
; Extracts lanes 4-7 of %b and %c, passes them to llvm.aarch64.neon.uabd.v4i16,
; zero-extends the result to <4 x i32> and adds the accumulator %a.
; The expected assembly below folds all of this into uabal2.
1897 define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
1898 ; CHECK-LABEL: test_vabal_high_u16:
1899 ; CHECK: // %bb.0: // %entry
1900 ; CHECK-NEXT: uabal2 v0.4s, v1.8h, v2.8h
1903 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1904 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1905 %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
1906 %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
1907 %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
1908 ret <4 x i32> %add.i.i
; Extracts lanes 2-3 of %b and %c, passes them to llvm.aarch64.neon.uabd.v2i32,
; zero-extends the result to <2 x i64> and adds the accumulator %a.
; The expected assembly below folds all of this into uabal2.
1911 define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
1912 ; CHECK-LABEL: test_vabal_high_u32:
1913 ; CHECK: // %bb.0: // %entry
1914 ; CHECK-NEXT: uabal2 v0.2d, v1.4s, v2.4s
1917 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1918 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1919 %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
1920 %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
1921 %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
1922 ret <2 x i64> %add.i.i
; Calls llvm.aarch64.neon.smull.v8i16 directly on the two <8 x i8> arguments
; and returns its <8 x i16> result. The expected assembly below is smull.
1925 define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) {
1926 ; CHECK-LABEL: test_vmull_s8:
1927 ; CHECK: // %bb.0: // %entry
1928 ; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
1931 %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
1932 ret <8 x i16> %vmull.i
; Calls llvm.aarch64.neon.smull.v4i32 directly on the two <4 x i16> arguments
; and returns its <4 x i32> result. The expected assembly below is smull.
1935 define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) {
1936 ; CHECK-LABEL: test_vmull_s16:
1937 ; CHECK: // %bb.0: // %entry
1938 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
1941 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
1942 ret <4 x i32> %vmull2.i
; Calls llvm.aarch64.neon.smull.v2i64 directly on the two <2 x i32> arguments
; and returns its <2 x i64> result. The expected assembly below is smull.
1945 define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) {
1946 ; CHECK-LABEL: test_vmull_s32:
1947 ; CHECK: // %bb.0: // %entry
1948 ; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
1951 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
1952 ret <2 x i64> %vmull2.i
; Calls llvm.aarch64.neon.umull.v8i16 directly on the two <8 x i8> arguments
; and returns its <8 x i16> result. The expected assembly below is umull.
1955 define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) {
1956 ; CHECK-LABEL: test_vmull_u8:
1957 ; CHECK: // %bb.0: // %entry
1958 ; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
1961 %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
1962 ret <8 x i16> %vmull.i
; Calls llvm.aarch64.neon.umull.v4i32 directly on the two <4 x i16> arguments
; and returns its <4 x i32> result. The expected assembly below is umull.
1965 define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) {
1966 ; CHECK-LABEL: test_vmull_u16:
1967 ; CHECK: // %bb.0: // %entry
1968 ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
1971 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
1972 ret <4 x i32> %vmull2.i
; Calls llvm.aarch64.neon.umull.v2i64 directly on the two <2 x i32> arguments
; and returns its <2 x i64> result. The expected assembly below is umull.
1975 define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) {
1976 ; CHECK-LABEL: test_vmull_u32:
1977 ; CHECK: // %bb.0: // %entry
1978 ; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
1981 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
1982 ret <2 x i64> %vmull2.i
; Extracts lanes 8-15 of %a and %b and passes them to
; llvm.aarch64.neon.smull.v8i16.
; The expected assembly below is the high-half form, smull2.
1985 define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) {
1986 ; CHECK-LABEL: test_vmull_high_s8:
1987 ; CHECK: // %bb.0: // %entry
1988 ; CHECK-NEXT: smull2 v0.8h, v0.16b, v1.16b
1991 %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1992 %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1993 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
1994 ret <8 x i16> %vmull.i.i
; Extracts lanes 4-7 of %a and %b and passes them to
; llvm.aarch64.neon.smull.v4i32.
; The expected assembly below is the high-half form, smull2.
1997 define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
1998 ; CHECK-LABEL: test_vmull_high_s16:
1999 ; CHECK: // %bb.0: // %entry
2000 ; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
2003 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2004 %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2005 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2006 ret <4 x i32> %vmull2.i.i
; Extracts lanes 2-3 of %a and %b and passes them to
; llvm.aarch64.neon.smull.v2i64.
; The expected assembly below is the high-half form, smull2.
2009 define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
2010 ; CHECK-LABEL: test_vmull_high_s32:
2011 ; CHECK: // %bb.0: // %entry
2012 ; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
2015 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2016 %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2017 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
2018 ret <2 x i64> %vmull2.i.i
; Extracts lanes 8-15 of %a and %b and passes them to
; llvm.aarch64.neon.umull.v8i16.
; The expected assembly below is the high-half form, umull2.
2021 define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) {
2022 ; CHECK-LABEL: test_vmull_high_u8:
2023 ; CHECK: // %bb.0: // %entry
2024 ; CHECK-NEXT: umull2 v0.8h, v0.16b, v1.16b
2027 %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2028 %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2029 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
2030 ret <8 x i16> %vmull.i.i
; Extracts lanes 4-7 of %a and %b and passes them to
; llvm.aarch64.neon.umull.v4i32.
; The expected assembly below is the high-half form, umull2.
2033 define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) {
2034 ; CHECK-LABEL: test_vmull_high_u16:
2035 ; CHECK: // %bb.0: // %entry
2036 ; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
2039 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2040 %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2041 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2042 ret <4 x i32> %vmull2.i.i
; Extracts lanes 2-3 of %a and %b and passes them to
; llvm.aarch64.neon.umull.v2i64.
; The expected assembly below is the high-half form, umull2.
2045 define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) {
2046 ; CHECK-LABEL: test_vmull_high_u32:
2047 ; CHECK: // %bb.0: // %entry
2048 ; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
2051 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2052 %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2053 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
2054 ret <2 x i64> %vmull2.i.i
; Multiplies %b and %c with llvm.aarch64.neon.smull.v8i16 and adds the product
; to the accumulator %a. The expected assembly below fuses both into smlal.
2057 define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
2058 ; CHECK-LABEL: test_vmlal_s8:
2059 ; CHECK: // %bb.0: // %entry
2060 ; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
2063 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
2064 %add.i = add <8 x i16> %vmull.i.i, %a
2065 ret <8 x i16> %add.i
; Multiplies %b and %c with llvm.aarch64.neon.smull.v4i32 and adds the product
; to the accumulator %a. The expected assembly below fuses both into smlal.
2068 define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
2069 ; CHECK-LABEL: test_vmlal_s16:
2070 ; CHECK: // %bb.0: // %entry
2071 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
2074 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
2075 %add.i = add <4 x i32> %vmull2.i.i, %a
2076 ret <4 x i32> %add.i
; Multiplies %b and %c with llvm.aarch64.neon.smull.v2i64 and adds the product
; to the accumulator %a. The expected assembly below fuses both into smlal.
2079 define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
2080 ; CHECK-LABEL: test_vmlal_s32:
2081 ; CHECK: // %bb.0: // %entry
2082 ; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
2085 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
2086 %add.i = add <2 x i64> %vmull2.i.i, %a
2087 ret <2 x i64> %add.i
; Multiplies %b and %c with llvm.aarch64.neon.umull.v8i16 and adds the product
; to the accumulator %a. The expected assembly below fuses both into umlal.
2090 define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
2091 ; CHECK-LABEL: test_vmlal_u8:
2092 ; CHECK: // %bb.0: // %entry
2093 ; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b
2096 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
2097 %add.i = add <8 x i16> %vmull.i.i, %a
2098 ret <8 x i16> %add.i
; Multiplies %b and %c with llvm.aarch64.neon.umull.v4i32 and adds the product
; to the accumulator %a. The expected assembly below fuses both into umlal.
2101 define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
2102 ; CHECK-LABEL: test_vmlal_u16:
2103 ; CHECK: // %bb.0: // %entry
2104 ; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
2107 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
2108 %add.i = add <4 x i32> %vmull2.i.i, %a
2109 ret <4 x i32> %add.i
; Multiplies %b and %c with llvm.aarch64.neon.umull.v2i64 and adds the product
; to the accumulator %a. The expected assembly below fuses both into umlal.
2112 define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
2113 ; CHECK-LABEL: test_vmlal_u32:
2114 ; CHECK: // %bb.0: // %entry
2115 ; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
2118 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
2119 %add.i = add <2 x i64> %vmull2.i.i, %a
2120 ret <2 x i64> %add.i
; Extracts lanes 8-15 of %b and %c, multiplies them with
; llvm.aarch64.neon.smull.v8i16 and adds the product to the accumulator %a.
; The expected assembly below fuses this into the high-half form, smlal2.
2123 define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
2124 ; CHECK-LABEL: test_vmlal_high_s8:
2125 ; CHECK: // %bb.0: // %entry
2126 ; CHECK-NEXT: smlal2 v0.8h, v1.16b, v2.16b
2129 %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2130 %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2131 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
2132 %add.i.i = add <8 x i16> %vmull.i.i.i, %a
2133 ret <8 x i16> %add.i.i
; Extracts lanes 4-7 of %b and %c, multiplies them with
; llvm.aarch64.neon.smull.v4i32 and adds the product to the accumulator %a.
; The expected assembly below fuses this into the high-half form, smlal2.
2136 define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
2137 ; CHECK-LABEL: test_vmlal_high_s16:
2138 ; CHECK: // %bb.0: // %entry
2139 ; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.8h
2142 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2143 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2144 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2145 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
2146 ret <4 x i32> %add.i.i
; Extracts lanes 2-3 of %b and %c, multiplies them with
; llvm.aarch64.neon.smull.v2i64 and adds the product to the accumulator %a.
; The expected assembly below fuses this into the high-half form, smlal2.
2149 define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
2150 ; CHECK-LABEL: test_vmlal_high_s32:
2151 ; CHECK: // %bb.0: // %entry
2152 ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
2155 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2156 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2157 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
2158 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
2159 ret <2 x i64> %add.i.i
; Extracts lanes 8-15 of %b and %c, multiplies them with
; llvm.aarch64.neon.umull.v8i16 and adds the product to the accumulator %a.
; The expected assembly below fuses this into the high-half form, umlal2.
2162 define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
2163 ; CHECK-LABEL: test_vmlal_high_u8:
2164 ; CHECK: // %bb.0: // %entry
2165 ; CHECK-NEXT: umlal2 v0.8h, v1.16b, v2.16b
2168 %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2169 %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2170 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
2171 %add.i.i = add <8 x i16> %vmull.i.i.i, %a
2172 ret <8 x i16> %add.i.i
; Extracts lanes 4-7 of %b and %c, multiplies them with
; llvm.aarch64.neon.umull.v4i32 and adds the product to the accumulator %a.
; The expected assembly below fuses this into the high-half form, umlal2.
2175 define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
2176 ; CHECK-LABEL: test_vmlal_high_u16:
2177 ; CHECK: // %bb.0: // %entry
2178 ; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.8h
2181 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2182 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2183 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2184 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
2185 ret <4 x i32> %add.i.i
; Extracts lanes 2-3 of %b and %c, multiplies them with
; llvm.aarch64.neon.umull.v2i64 and adds the product to the accumulator %a.
; The expected assembly below fuses this into the high-half form, umlal2.
2188 define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
2189 ; CHECK-LABEL: test_vmlal_high_u32:
2190 ; CHECK: // %bb.0: // %entry
2191 ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s
2194 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2195 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2196 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
2197 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
2198 ret <2 x i64> %add.i.i
; Multiplies %b and %c with llvm.aarch64.neon.smull.v8i16 and subtracts the
; product from the accumulator %a (note the operand order of the sub).
; The expected assembly below fuses both into smlsl.
2201 define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
2202 ; CHECK-LABEL: test_vmlsl_s8:
2203 ; CHECK: // %bb.0: // %entry
2204 ; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
2207 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
2208 %sub.i = sub <8 x i16> %a, %vmull.i.i
2209 ret <8 x i16> %sub.i
; Multiplies %b and %c with llvm.aarch64.neon.smull.v4i32 and subtracts the
; product from the accumulator %a (note the operand order of the sub).
; The expected assembly below fuses both into smlsl.
2212 define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
2213 ; CHECK-LABEL: test_vmlsl_s16:
2214 ; CHECK: // %bb.0: // %entry
2215 ; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
2218 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
2219 %sub.i = sub <4 x i32> %a, %vmull2.i.i
2220 ret <4 x i32> %sub.i
; Multiplies %b and %c with llvm.aarch64.neon.smull.v2i64 and subtracts the
; product from the accumulator %a (note the operand order of the sub).
; The expected assembly below fuses both into smlsl.
2223 define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
2224 ; CHECK-LABEL: test_vmlsl_s32:
2225 ; CHECK: // %bb.0: // %entry
2226 ; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
2229 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
2230 %sub.i = sub <2 x i64> %a, %vmull2.i.i
2231 ret <2 x i64> %sub.i
; Multiplies %b and %c with llvm.aarch64.neon.umull.v8i16 and subtracts the
; product from the accumulator %a (note the operand order of the sub).
; The expected assembly below fuses both into umlsl.
2234 define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
2235 ; CHECK-LABEL: test_vmlsl_u8:
2236 ; CHECK: // %bb.0: // %entry
2237 ; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b
2240 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
2241 %sub.i = sub <8 x i16> %a, %vmull.i.i
2242 ret <8 x i16> %sub.i
; Multiplies %b and %c with llvm.aarch64.neon.umull.v4i32 and subtracts the
; product from the accumulator %a (note the operand order of the sub).
; The expected assembly below fuses both into umlsl.
2245 define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
2246 ; CHECK-LABEL: test_vmlsl_u16:
2247 ; CHECK: // %bb.0: // %entry
2248 ; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
2251 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
2252 %sub.i = sub <4 x i32> %a, %vmull2.i.i
2253 ret <4 x i32> %sub.i
; Multiplies %b and %c with llvm.aarch64.neon.umull.v2i64 and subtracts the
; product from the accumulator %a (note the operand order of the sub).
; The expected assembly below fuses both into umlsl.
2256 define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
2257 ; CHECK-LABEL: test_vmlsl_u32:
2258 ; CHECK: // %bb.0: // %entry
2259 ; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
2262 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
2263 %sub.i = sub <2 x i64> %a, %vmull2.i.i
2264 ret <2 x i64> %sub.i
; Extracts lanes 8-15 of %b and %c, multiplies them with
; llvm.aarch64.neon.smull.v8i16 and subtracts the product from %a.
; The expected assembly below fuses this into the high-half form, smlsl2.
2267 define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
2268 ; CHECK-LABEL: test_vmlsl_high_s8:
2269 ; CHECK: // %bb.0: // %entry
2270 ; CHECK-NEXT: smlsl2 v0.8h, v1.16b, v2.16b
2273 %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2274 %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2275 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
2276 %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
2277 ret <8 x i16> %sub.i.i
; Extracts lanes 4-7 of %b and %c, multiplies them with
; llvm.aarch64.neon.smull.v4i32 and subtracts the product from %a.
; The expected assembly below fuses this into the high-half form, smlsl2.
2280 define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
2281 ; CHECK-LABEL: test_vmlsl_high_s16:
2282 ; CHECK: // %bb.0: // %entry
2283 ; CHECK-NEXT: smlsl2 v0.4s, v1.8h, v2.8h
2286 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2287 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2288 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2289 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
2290 ret <4 x i32> %sub.i.i
; Extracts lanes 2-3 of %b and %c, multiplies them with
; llvm.aarch64.neon.smull.v2i64 and subtracts the product from %a.
; The expected assembly below fuses this into the high-half form, smlsl2.
2293 define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
2294 ; CHECK-LABEL: test_vmlsl_high_s32:
2295 ; CHECK: // %bb.0: // %entry
2296 ; CHECK-NEXT: smlsl2 v0.2d, v1.4s, v2.4s
2299 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2300 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2301 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
2302 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
2303 ret <2 x i64> %sub.i.i
; Extracts lanes 8-15 of %b and %c, multiplies them with
; llvm.aarch64.neon.umull.v8i16 and subtracts the product from %a.
; The expected assembly below fuses this into the high-half form, umlsl2.
2306 define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
2307 ; CHECK-LABEL: test_vmlsl_high_u8:
2308 ; CHECK: // %bb.0: // %entry
2309 ; CHECK-NEXT: umlsl2 v0.8h, v1.16b, v2.16b
2312 %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2313 %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2314 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
2315 %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
2316 ret <8 x i16> %sub.i.i
; Extracts lanes 4-7 of %b and %c, multiplies them with
; llvm.aarch64.neon.umull.v4i32 and subtracts the product from %a.
; The expected assembly below fuses this into the high-half form, umlsl2.
2319 define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
2320 ; CHECK-LABEL: test_vmlsl_high_u16:
2321 ; CHECK: // %bb.0: // %entry
2322 ; CHECK-NEXT: umlsl2 v0.4s, v1.8h, v2.8h
2325 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2326 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2327 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2328 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
2329 ret <4 x i32> %sub.i.i
; Extracts lanes 2-3 of %b and %c, multiplies them with
; llvm.aarch64.neon.umull.v2i64 and subtracts the product from %a.
; The expected assembly below fuses this into the high-half form, umlsl2.
2332 define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
2333 ; CHECK-LABEL: test_vmlsl_high_u32:
2334 ; CHECK: // %bb.0: // %entry
2335 ; CHECK-NEXT: umlsl2 v0.2d, v1.4s, v2.4s
2338 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2339 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2340 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
2341 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
2342 ret <2 x i64> %sub.i.i
; Calls llvm.aarch64.neon.sqdmull.v4i32 directly on the two <4 x i16> arguments
; and returns its <4 x i32> result. The expected assembly below is sqdmull.
2345 define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) {
2346 ; CHECK-LABEL: test_vqdmull_s16:
2347 ; CHECK: // %bb.0: // %entry
2348 ; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.4h
2351 %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
2352 ret <4 x i32> %vqdmull2.i
; Calls llvm.aarch64.neon.sqdmull.v2i64 directly on the two <2 x i32> arguments
; and returns its <2 x i64> result. The expected assembly below is sqdmull.
2355 define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) {
2356 ; CHECK-LABEL: test_vqdmull_s32:
2357 ; CHECK: // %bb.0: // %entry
2358 ; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.2s
2361 %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
2362 ret <2 x i64> %vqdmull2.i
; Computes llvm.aarch64.neon.sqdmull.v4i32(%b, %c) and combines it with the
; accumulator %a through llvm.aarch64.neon.sqadd.v4i32.
; The expected assembly below fuses the two intrinsics into sqdmlal.
2365 define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
2366 ; CHECK-LABEL: test_vqdmlal_s16:
2367 ; CHECK: // %bb.0: // %entry
2368 ; CHECK-NEXT: sqdmlal v0.4s, v1.4h, v2.4h
2371 %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
2372 %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
2373 ret <4 x i32> %vqdmlal4.i
; Computes llvm.aarch64.neon.sqdmull.v2i64(%b, %c) and combines it with the
; accumulator %a through llvm.aarch64.neon.sqadd.v2i64.
; The expected assembly below fuses the two intrinsics into sqdmlal.
2376 define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
2377 ; CHECK-LABEL: test_vqdmlal_s32:
2378 ; CHECK: // %bb.0: // %entry
2379 ; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.2s
2382 %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
2383 %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
2384 ret <2 x i64> %vqdmlal4.i
; Computes llvm.aarch64.neon.sqdmull.v4i32(%b, %c) and combines it with the
; accumulator %a through llvm.aarch64.neon.sqsub.v4i32 (%a is the first operand).
; The expected assembly below fuses the two intrinsics into sqdmlsl.
2387 define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
2388 ; CHECK-LABEL: test_vqdmlsl_s16:
2389 ; CHECK: // %bb.0: // %entry
2390 ; CHECK-NEXT: sqdmlsl v0.4s, v1.4h, v2.4h
2393 %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
2394 %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
2395 ret <4 x i32> %vqdmlsl4.i
; Computes llvm.aarch64.neon.sqdmull.v2i64(%b, %c) and combines it with the
; accumulator %a through llvm.aarch64.neon.sqsub.v2i64 (%a is the first operand).
; The expected assembly below fuses the two intrinsics into sqdmlsl.
2398 define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
2399 ; CHECK-LABEL: test_vqdmlsl_s32:
2400 ; CHECK: // %bb.0: // %entry
2401 ; CHECK-NEXT: sqdmlsl v0.2d, v1.2s, v2.2s
2404 %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
2405 %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
2406 ret <2 x i64> %vqdmlsl4.i
; Extracts lanes 4-7 of %a and %b and passes them to
; llvm.aarch64.neon.sqdmull.v4i32.
; The expected assembly below is the high-half form, sqdmull2.
2409 define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
2410 ; CHECK-LABEL: test_vqdmull_high_s16:
2411 ; CHECK: // %bb.0: // %entry
2412 ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v1.8h
2415 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2416 %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2417 %vqdmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2418 ret <4 x i32> %vqdmull2.i.i
; Extracts lanes 2-3 of %a and %b and passes them to
; llvm.aarch64.neon.sqdmull.v2i64.
; The expected assembly below is the high-half form, sqdmull2.
2421 define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
2422 ; CHECK-LABEL: test_vqdmull_high_s32:
2423 ; CHECK: // %bb.0: // %entry
2424 ; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s
2427 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2428 %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2429 %vqdmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
2430 ret <2 x i64> %vqdmull2.i.i
; Extracts lanes 4-7 of %b and %c, computes llvm.aarch64.neon.sqdmull.v4i32 on
; them and combines it with %a through llvm.aarch64.neon.sqadd.v4i32.
; The expected assembly below fuses this into the high-half form, sqdmlal2.
2433 define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
2434 ; CHECK-LABEL: test_vqdmlal_high_s16:
2435 ; CHECK: // %bb.0: // %entry
2436 ; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h
2439 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2440 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2441 %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2442 %vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
2443 ret <4 x i32> %vqdmlal4.i.i
; Extracts lanes 2-3 of %b and %c, computes llvm.aarch64.neon.sqdmull.v2i64 on
; them and combines it with %a through llvm.aarch64.neon.sqadd.v2i64.
; The expected assembly below fuses this into the high-half form, sqdmlal2.
2446 define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
2447 ; CHECK-LABEL: test_vqdmlal_high_s32:
2448 ; CHECK: // %bb.0: // %entry
2449 ; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s
2452 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2453 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2454 %vqdmlal2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
2455 %vqdmlal4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
2456 ret <2 x i64> %vqdmlal4.i.i
; Extracts lanes 4-7 of %b and %c, computes llvm.aarch64.neon.sqdmull.v4i32 on
; them and combines it with %a through llvm.aarch64.neon.sqsub.v4i32.
; The expected assembly below fuses this into the high-half form, sqdmlsl2.
2459 define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
2460 ; CHECK-LABEL: test_vqdmlsl_high_s16:
2461 ; CHECK: // %bb.0: // %entry
2462 ; CHECK-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.8h
2465 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2466 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2467 %vqdmlsl2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2468 %vqdmlsl4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
2469 ret <4 x i32> %vqdmlsl4.i.i
; Extracts lanes 2-3 of %b and %c, computes llvm.aarch64.neon.sqdmull.v2i64 on
; them and combines it with %a through llvm.aarch64.neon.sqsub.v2i64.
; The expected assembly below fuses this into the high-half form, sqdmlsl2.
2472 define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
2473 ; CHECK-LABEL: test_vqdmlsl_high_s32:
2474 ; CHECK: // %bb.0: // %entry
2475 ; CHECK-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.4s
2478 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2479 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2480 %vqdmlsl2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
2481 %vqdmlsl4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
2482 ret <2 x i64> %vqdmlsl4.i.i
; Calls llvm.aarch64.neon.pmull.v8i16 directly on the two <8 x i8> arguments
; and returns its <8 x i16> result. The expected assembly below is pmull.
2485 define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) {
2486 ; CHECK-LABEL: test_vmull_p8:
2487 ; CHECK: // %bb.0: // %entry
2488 ; CHECK-NEXT: pmull v0.8h, v0.8b, v1.8b
2491 %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b)
2492 ret <8 x i16> %vmull.i
; Extracts lanes 8-15 of %a and %b and passes them to
; llvm.aarch64.neon.pmull.v8i16.
; The expected assembly below is the high-half form, pmull2.
2495 define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) {
2496 ; CHECK-LABEL: test_vmull_high_p8:
2497 ; CHECK: // %bb.0: // %entry
2498 ; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b
2501 %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2502 %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2503 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
2504 ret <8 x i16> %vmull.i.i
; Passes the scalar i64 arguments %a and %b to llvm.aarch64.neon.pmull64 and
; bitcasts the <16 x i8> result to i128.
; The expected assembly below moves both scalars into d registers with fmov,
; issues pmull on the .1d lanes, then moves the two 64-bit halves of the
; result into x0 and x1.
; NOTE(review): no `ret` instruction is visible for this function in this
; excerpt; presumably it returns %vmull3.i -- confirm against the full file.
2507 define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
2508 ; CHECK-LABEL: test_vmull_p64:
2509 ; CHECK: // %bb.0: // %entry
2510 ; CHECK-NEXT: fmov d0, x1
2511 ; CHECK-NEXT: fmov d1, x0
2512 ; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
2513 ; CHECK-NEXT: mov x1, v0.d[1]
2514 ; CHECK-NEXT: fmov x0, d0
2517 %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
2518 %vmull3.i = bitcast <16 x i8> %vmull2.i to i128
; Extracts element 1 of %a and of %b, passes the two scalars to
; llvm.aarch64.neon.pmull64 and returns the result bitcast to i128.
; The expected assembly below uses pmull2 directly on the .2d vector inputs
; (no separate lane extraction is listed), then moves the two 64-bit halves
; of the result into x0 and x1.
2522 define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
2523 ; CHECK-LABEL: test_vmull_high_p64:
2524 ; CHECK: // %bb.0: // %entry
2525 ; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
2526 ; CHECK-NEXT: mov x1, v0.d[1]
2527 ; CHECK-NEXT: fmov x0, d0
2530 %0 = extractelement <2 x i64> %a, i32 1
2531 %1 = extractelement <2 x i64> %b, i32 1
2532 %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %0, i64 %1) #1
2533 %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128
2534 ret i128 %vmull3.i.i
; Multiplies the eight i16 lanes of %a by two 16-bit factors packed in the
; low 32 bits of %scale.coerce, using saturating doubling multiplies
; (sqdmull) whose results are combined with saturating adds (sqadd).
; NOTE(review): the function name suggests a complex multiply over
; interleaved re/im lanes - that reading is inferred from the name only.
; Expected output: each sqdmull + sqadd pair is selected as sqdmlal /
; sqdmlal2, the first multiplier is used in by-element form (v5.h[0]), the
; adjacent-lane swap becomes rev32, the lane interleave becomes a tbl with a
; constant-pool mask, and the final odd-lane selection becomes uzp2.
2537 define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coerce) {
2538 ; CHECK-LABEL: cmplx_mul_combined_re_im:
2539 ; CHECK: // %bb.0: // %entry
2540 ; CHECK-NEXT: lsr x9, x0, #16
2541 ; CHECK-NEXT: adrp x8, .LCPI196_0
2542 ; CHECK-NEXT: fmov d5, x0
2543 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI196_0]
2544 ; CHECK-NEXT: rev32 v4.8h, v0.8h
2545 ; CHECK-NEXT: dup v1.8h, w9
2546 ; CHECK-NEXT: sqneg v2.8h, v1.8h
2547 ; CHECK-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
2548 ; CHECK-NEXT: sqdmull v2.4s, v0.4h, v5.h[0]
2549 ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v5.h[0]
2550 ; CHECK-NEXT: sqdmlal v2.4s, v4.4h, v1.4h
2551 ; CHECK-NEXT: sqdmlal2 v0.4s, v4.8h, v1.8h
2552 ; CHECK-NEXT: uzp2 v0.8h, v2.8h, v0.8h
; Shift the packed scale right by 16 bits so that its second 16-bit field is
; the one splatted below (the expected output splats w9 after the lsr).
2555 %scale.sroa.2.0.extract.shift23 = lshr i64 %scale.coerce, 16
; Swap each adjacent pair of i16 lanes of %a.
2556 %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
2557 %vec.scale.coerce = bitcast i64 %scale.coerce to <4 x i16>
2558 %vec.scale.sroa.2.0.extract.shift23 = bitcast i64 %scale.sroa.2.0.extract.shift23 to <4 x i16>
; Splat lane 0 of the shifted scale across all eight lanes.
2559 %vecinit7.i25 = shufflevector <4 x i16> %vec.scale.sroa.2.0.extract.shift23, <4 x i16> poison, <8 x i32> zeroinitializer
2560 %vqnegq_v1.i = tail call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %vecinit7.i25)
; Interleave: even lanes come from the negated splat, odd lanes from the
; original (un-negated) splat.
2561 %0 = shufflevector <8 x i16> %vqnegq_v1.i, <8 x i16> %vecinit7.i25, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
; First products: low and high halves of %a, each multiplied by a splat of
; lane 0 of the unshifted scale.
2562 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2563 %shuffle.i3.i = shufflevector <4 x i16> %vec.scale.coerce, <4 x i16> poison, <4 x i32> zeroinitializer
2564 %vqdmull_v2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
2565 %shuffle.i.i26 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2566 %vqdmull_v2.i.i28 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i26, <4 x i16> %shuffle.i3.i)
; Second products: low half of the lane-swapped %a times the low half of the
; interleaved vector, accumulated into the first low-half product with a
; saturating add.
2567 %shuffle.i.i29 = shufflevector <8 x i16> %shuffle.i, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2568 %shuffle.i3.i30 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2569 %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i29, <4 x i16> %shuffle.i3.i30)
2570 %vqdmlal_v3.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %vqdmull_v2.i.i, <4 x i32> %vqdmlal2.i.i)
; Same accumulation for the high halves.
2571 %shuffle.i.i31 = shufflevector <8 x i16> %shuffle.i, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2572 %shuffle.i3.i32 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2573 %vqdmlal2.i.i33 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i31, <4 x i16> %shuffle.i3.i32)
2574 %vqdmlal_v3.i.i34 = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %vqdmull_v2.i.i28, <4 x i32> %vqdmlal2.i.i33)
; Reinterpret both i32 accumulators as i16 lanes and keep only the
; odd-numbered i16 lanes of the concatenated pair.
2575 %1 = bitcast <4 x i32> %vqdmlal_v3.i.i to <8 x i16>
2576 %2 = bitcast <4 x i32> %vqdmlal_v3.i.i34 to <8 x i16>
2577 %shuffle.i35 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2578 ret <8 x i16> %shuffle.i35
; Declaration of the sqneg intrinsic on <8 x i16> that is called by
; @cmplx_mul_combined_re_im.
2581 declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>)