1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast \
3 ; RUN: < %s -verify-machineinstrs | FileCheck %s
; vmull_high_n_s16: signed widening multiply of the high four i16 lanes of %a
; (shuffle indices 4-7) by a four-lane splat of the scalar %b. The assertions
; expect the splat to be emitted as a full-width dup feeding one smull2.
5 define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
6 ; CHECK-LABEL: test_vmull_high_n_s16:
7 ; CHECK: // %bb.0: // %entry
8 ; CHECK-NEXT: dup v1.8h, w0
9 ; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
12 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
13 %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
14 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
15 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
16 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
17 %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
18 ret <4 x i32> %vmull15.i.i
; Constant-operand variant of vmull_high_n_s16: the high four i16 lanes of %a
; are multiplied by a constant splat of 29. The assertions expect the constant
; to be materialized with a 128-bit movi and consumed by smull2.
21 define <4 x i32> @test_vmull_high_n_s16_imm(<8 x i16> %a) #0 {
22 ; CHECK-LABEL: test_vmull_high_n_s16_imm:
23 ; CHECK: // %bb.0: // %entry
24 ; CHECK-NEXT: movi v1.8h, #29
25 ; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
28 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
29 %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
30 ret <4 x i32> %vmull15.i.i
; vmull_high_n_s32: signed widening multiply of the high two i32 lanes of %a
; (shuffle indices 2-3) by a two-lane splat of the scalar %b. The assertions
; expect a dup of w0 into v1.4s followed by smull2.
33 define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
34 ; CHECK-LABEL: test_vmull_high_n_s32:
35 ; CHECK: // %bb.0: // %entry
36 ; CHECK-NEXT: dup v1.4s, w0
37 ; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
40 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
41 %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
42 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
43 %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
44 ret <2 x i64> %vmull9.i.i
; Constant-operand variant of vmull_high_n_s32 with a splat of 511 (0x1FF).
; The assertions expect the constant to be built with the shifting-ones form
; "movi #1, msl #8" and then consumed by smull2.
47 define <2 x i64> @test_vmull_high_n_s32_imm(<4 x i32> %a) #0 {
48 ; CHECK-LABEL: test_vmull_high_n_s32_imm:
49 ; CHECK: // %bb.0: // %entry
50 ; CHECK-NEXT: movi v1.4s, #1, msl #8
51 ; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
54 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
55 %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 511, i32 511>)
56 ret <2 x i64> %vmull9.i.i
; vmull_high_n_u16: unsigned widening multiply of the high four i16 lanes of
; %a (shuffle indices 4-7) by a four-lane splat of the scalar %b. The
; assertions expect a dup of w0 into v1.8h followed by umull2.
59 define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
60 ; CHECK-LABEL: test_vmull_high_n_u16:
61 ; CHECK: // %bb.0: // %entry
62 ; CHECK-NEXT: dup v1.8h, w0
63 ; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
66 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
67 %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
68 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
69 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
70 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
71 %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
72 ret <4 x i32> %vmull15.i.i
; Constant-operand variant of vmull_high_n_u16 with a splat of 4352
; (17 << 8). The assertions expect the constant to be built with
; "movi #17, lsl #8" and then consumed by umull2.
75 define <4 x i32> @test_vmull_high_n_u16_imm(<8 x i16> %a) #0 {
76 ; CHECK-LABEL: test_vmull_high_n_u16_imm:
77 ; CHECK: // %bb.0: // %entry
78 ; CHECK-NEXT: movi v1.8h, #17, lsl #8
79 ; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
82 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
83 %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 4352, i16 4352, i16 4352, i16 4352>)
84 ret <4 x i32> %vmull15.i.i
; vmull_high_n_u32: unsigned widening multiply of the high two i32 lanes of
; %a (shuffle indices 2-3) by a two-lane splat of the scalar %b. The
; assertions expect a dup of w0 into v1.4s followed by umull2.
87 define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
88 ; CHECK-LABEL: test_vmull_high_n_u32:
89 ; CHECK: // %bb.0: // %entry
90 ; CHECK-NEXT: dup v1.4s, w0
91 ; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
94 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
95 %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
96 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
97 %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
98 ret <2 x i64> %vmull9.i.i
; Constant-operand variant of vmull_high_n_u32 with a splat of 4294966784
; (0xFFFFFE00, the bitwise inverse of 0x1FF). The assertions expect the
; inverted form "mvni #1, msl #8" followed by umull2.
101 define <2 x i64> @test_vmull_high_n_u32_imm(<4 x i32> %a) #0 {
102 ; CHECK-LABEL: test_vmull_high_n_u32_imm:
103 ; CHECK: // %bb.0: // %entry
104 ; CHECK-NEXT: mvni v1.4s, #1, msl #8
105 ; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
108 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
109 %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 4294966784, i32 4294966784>)
110 ret <2 x i64> %vmull9.i.i
; vqdmull_high_n_s16: calls the sqdmull intrinsic on the high four i16 lanes
; of %a (shuffle indices 4-7) and a four-lane splat of the scalar %b. The
; assertions expect a dup of w0 into v1.8h followed by sqdmull2.
113 define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
114 ; CHECK-LABEL: test_vqdmull_high_n_s16:
115 ; CHECK: // %bb.0: // %entry
116 ; CHECK-NEXT: dup v1.8h, w0
117 ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v1.8h
120 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
121 %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
122 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
123 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
124 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
125 %vqdmull15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
126 ret <4 x i32> %vqdmull15.i.i
; Constant-operand variant of vqdmull_high_n_s16 with a splat of 61183
; (0xEEFF, the 16-bit inverse of 17 << 8). The assertions expect the inverted
; form "mvni #17, lsl #8" followed by sqdmull2.
129 define <4 x i32> @test_vqdmull_high_n_s16_imm(<8 x i16> %a) #0 {
130 ; CHECK-LABEL: test_vqdmull_high_n_s16_imm:
131 ; CHECK: // %bb.0: // %entry
132 ; CHECK-NEXT: mvni v1.8h, #17, lsl #8
133 ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v1.8h
136 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
137 %vqdmull15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 61183, i16 61183, i16 61183, i16 61183>)
138 ret <4 x i32> %vqdmull15.i.i
; vqdmull_high_n_s32: calls the sqdmull intrinsic on the high two i32 lanes
; of %a (shuffle indices 2-3) and a two-lane splat of the scalar %b. The
; assertions expect a dup of w0 into v1.4s followed by sqdmull2.
141 define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
142 ; CHECK-LABEL: test_vqdmull_high_n_s32:
143 ; CHECK: // %bb.0: // %entry
144 ; CHECK-NEXT: dup v1.4s, w0
145 ; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s
148 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
149 %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
150 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
151 %vqdmull9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
152 ret <2 x i64> %vqdmull9.i.i
; Constant-operand variant of vqdmull_high_n_s32: the high two i32 lanes of
; %a are passed to sqdmull together with a constant splat of 29. The
; assertions expect "movi v1.4s, #29" followed by sqdmull2.
155 define <2 x i64> @test_vqdmull_high_n_s32_imm(<4 x i32> %a) #0 {
156 ; CHECK-LABEL: test_vqdmull_high_n_s32_imm:
157 ; CHECK: // %bb.0: // %entry
158 ; CHECK-NEXT: movi v1.4s, #29
159 ; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s
162 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
163 %vqdmull9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
164 ret <2 x i64> %vqdmull9.i.i
; vmlal_high_n_s16: smull of the high four i16 lanes of %b (shuffle indices
; 4-7) with a splat of the scalar %c, then a vector add with the accumulator
; %a. The assertions expect the multiply and add to fuse into smlal2 after a
; dup of w0 into v2.8h.
167 define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
168 ; CHECK-LABEL: test_vmlal_high_n_s16:
169 ; CHECK: // %bb.0: // %entry
170 ; CHECK-NEXT: dup v2.8h, w0
171 ; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.8h
174 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
175 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
176 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
177 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
178 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
179 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
180 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
181 ret <4 x i32> %add.i.i
; Constant-operand variant of vmlal_high_n_s16: smull of the high four i16
; lanes of %b with a constant splat of 29, added to %a. The assertions expect
; "movi v2.8h, #29" followed by smlal2.
184 define <4 x i32> @test_vmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
185 ; CHECK-LABEL: test_vmlal_high_n_s16_imm:
186 ; CHECK: // %bb.0: // %entry
187 ; CHECK-NEXT: movi v2.8h, #29
188 ; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.8h
191 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
192 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
193 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
194 ret <4 x i32> %add.i.i
; vmlal_high_n_s32: smull of the high two i32 lanes of %b (shuffle indices
; 2-3) with a splat of the scalar %c, then a vector add with the accumulator
; %a. The assertions expect a dup of w0 into v2.4s followed by smlal2.
197 define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
198 ; CHECK-LABEL: test_vmlal_high_n_s32:
199 ; CHECK: // %bb.0: // %entry
200 ; CHECK-NEXT: dup v2.4s, w0
201 ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
204 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
205 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
206 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
207 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
208 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
209 ret <2 x i64> %add.i.i
; Constant-operand variant of vmlal_high_n_s32: smull of the high two i32
; lanes of %b with a constant splat of 29, added to %a. The assertions expect
; "movi v2.4s, #29" followed by smlal2.
212 define <2 x i64> @test_vmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
213 ; CHECK-LABEL: test_vmlal_high_n_s32_imm:
214 ; CHECK: // %bb.0: // %entry
215 ; CHECK-NEXT: movi v2.4s, #29
216 ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
219 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
220 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
221 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
222 ret <2 x i64> %add.i.i
; vmlal_high_n_u16: umull of the high four i16 lanes of %b (shuffle indices
; 4-7) with a splat of the scalar %c, then a vector add with the accumulator
; %a. The assertions expect a dup of w0 into v2.8h followed by umlal2.
225 define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
226 ; CHECK-LABEL: test_vmlal_high_n_u16:
227 ; CHECK: // %bb.0: // %entry
228 ; CHECK-NEXT: dup v2.8h, w0
229 ; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.8h
232 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
233 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
234 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
235 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
236 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
237 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
238 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
239 ret <4 x i32> %add.i.i
; Constant-operand variant of vmlal_high_n_u16: umull of the high four i16
; lanes of %b with a constant splat of 29, added to %a. The assertions expect
; "movi v2.8h, #29" followed by umlal2.
242 define <4 x i32> @test_vmlal_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
243 ; CHECK-LABEL: test_vmlal_high_n_u16_imm:
244 ; CHECK: // %bb.0: // %entry
245 ; CHECK-NEXT: movi v2.8h, #29
246 ; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.8h
249 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
250 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
251 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
252 ret <4 x i32> %add.i.i
; vmlal_high_n_u32: umull of the high two i32 lanes of %b (shuffle indices
; 2-3) with a splat of the scalar %c, then a vector add with the accumulator
; %a. The assertions expect a dup of w0 into v2.4s followed by umlal2.
255 define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
256 ; CHECK-LABEL: test_vmlal_high_n_u32:
257 ; CHECK: // %bb.0: // %entry
258 ; CHECK-NEXT: dup v2.4s, w0
259 ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s
262 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
263 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
264 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
265 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
266 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
267 ret <2 x i64> %add.i.i
; Constant-operand variant of vmlal_high_n_u32: umull of the high two i32
; lanes of %b with a constant splat of 29, added to %a. The assertions expect
; "movi v2.4s, #29" followed by umlal2.
270 define <2 x i64> @test_vmlal_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
271 ; CHECK-LABEL: test_vmlal_high_n_u32_imm:
272 ; CHECK: // %bb.0: // %entry
273 ; CHECK-NEXT: movi v2.4s, #29
274 ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s
277 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
278 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
279 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
280 ret <2 x i64> %add.i.i
; vqdmlal_high_n_s16: sqdmull of the high four i16 lanes of %b (shuffle
; indices 4-7) with a splat of the scalar %c, whose result is combined with
; the accumulator %a through the sqadd intrinsic. The assertions expect the
; pair to fuse into sqdmlal2 after a dup of w0 into v2.8h.
283 define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
284 ; CHECK-LABEL: test_vqdmlal_high_n_s16:
285 ; CHECK: // %bb.0: // %entry
286 ; CHECK-NEXT: dup v2.8h, w0
287 ; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h
290 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
291 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
292 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
293 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
294 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
295 %vqdmlal15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
296 %vqdmlal17.i.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
297 ret <4 x i32> %vqdmlal17.i.i
; Constant-operand variant of vqdmlal_high_n_s16: sqdmull of the high four
; i16 lanes of %b with a constant splat of 29, accumulated into %a via sqadd.
; The assertions expect "movi v2.8h, #29" followed by sqdmlal2.
300 define <4 x i32> @test_vqdmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
301 ; CHECK-LABEL: test_vqdmlal_high_n_s16_imm:
302 ; CHECK: // %bb.0: // %entry
303 ; CHECK-NEXT: movi v2.8h, #29
304 ; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h
307 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
308 %vqdmlal15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
309 %vqdmlal17.i.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
310 ret <4 x i32> %vqdmlal17.i.i
; vqdmlal_high_n_s32: sqdmull of the high two i32 lanes of %b (shuffle
; indices 2-3) with a splat of the scalar %c, accumulated into %a via sqadd.
; The assertions expect a dup of w0 into v2.4s followed by sqdmlal2.
313 define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
314 ; CHECK-LABEL: test_vqdmlal_high_n_s32:
315 ; CHECK: // %bb.0: // %entry
316 ; CHECK-NEXT: dup v2.4s, w0
317 ; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s
320 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
321 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
322 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
323 %vqdmlal9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
324 %vqdmlal11.i.i = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
325 ret <2 x i64> %vqdmlal11.i.i
; Constant-operand variant of vqdmlal_high_n_s32: sqdmull of the high two i32
; lanes of %b with a constant splat of 29, accumulated into %a via sqadd. The
; assertions expect "movi v2.4s, #29" followed by sqdmlal2.
328 define <2 x i64> @test_vqdmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
329 ; CHECK-LABEL: test_vqdmlal_high_n_s32_imm:
330 ; CHECK: // %bb.0: // %entry
331 ; CHECK-NEXT: movi v2.4s, #29
332 ; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s
335 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
336 %vqdmlal9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
337 %vqdmlal11.i.i = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
338 ret <2 x i64> %vqdmlal11.i.i
; vmlsl_high_n_s16: smull of the high four i16 lanes of %b (shuffle indices
; 4-7) with a splat of the scalar %c, subtracted from the accumulator %a. The
; assertions expect a dup of w0 into v2.8h followed by smlsl2.
341 define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
342 ; CHECK-LABEL: test_vmlsl_high_n_s16:
343 ; CHECK: // %bb.0: // %entry
344 ; CHECK-NEXT: dup v2.8h, w0
345 ; CHECK-NEXT: smlsl2 v0.4s, v1.8h, v2.8h
348 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
349 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
350 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
351 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
352 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
353 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
354 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
355 ret <4 x i32> %sub.i.i
; Constant-operand variant of vmlsl_high_n_s16: smull of the high four i16
; lanes of %b with a constant splat of 29, subtracted from %a. The assertions
; expect "movi v2.8h, #29" followed by smlsl2.
358 define <4 x i32> @test_vmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
359 ; CHECK-LABEL: test_vmlsl_high_n_s16_imm:
360 ; CHECK: // %bb.0: // %entry
361 ; CHECK-NEXT: movi v2.8h, #29
362 ; CHECK-NEXT: smlsl2 v0.4s, v1.8h, v2.8h
365 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
366 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
367 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
368 ret <4 x i32> %sub.i.i
; vmlsl_high_n_s32: smull of the high two i32 lanes of %b (shuffle indices
; 2-3) with a splat of the scalar %c, subtracted from the accumulator %a. The
; assertions expect a dup of w0 into v2.4s followed by smlsl2.
371 define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
372 ; CHECK-LABEL: test_vmlsl_high_n_s32:
373 ; CHECK: // %bb.0: // %entry
374 ; CHECK-NEXT: dup v2.4s, w0
375 ; CHECK-NEXT: smlsl2 v0.2d, v1.4s, v2.4s
378 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
379 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
380 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
381 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
382 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
383 ret <2 x i64> %sub.i.i
; Constant-operand variant of vmlsl_high_n_s32: smull of the high two i32
; lanes of %b with a constant splat of 29, subtracted from %a. The assertions
; expect "movi v2.4s, #29" followed by smlsl2.
386 define <2 x i64> @test_vmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
387 ; CHECK-LABEL: test_vmlsl_high_n_s32_imm:
388 ; CHECK: // %bb.0: // %entry
389 ; CHECK-NEXT: movi v2.4s, #29
390 ; CHECK-NEXT: smlsl2 v0.2d, v1.4s, v2.4s
393 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
394 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
395 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
396 ret <2 x i64> %sub.i.i
; vmlsl_high_n_u16: umull of the high four i16 lanes of %b (shuffle indices
; 4-7) with a splat of the scalar %c, subtracted from the accumulator %a. The
; assertions expect a dup of w0 into v2.8h followed by umlsl2.
399 define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
400 ; CHECK-LABEL: test_vmlsl_high_n_u16:
401 ; CHECK: // %bb.0: // %entry
402 ; CHECK-NEXT: dup v2.8h, w0
403 ; CHECK-NEXT: umlsl2 v0.4s, v1.8h, v2.8h
406 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
408 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
409 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
410 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
411 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
412 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
413 ret <4 x i32> %sub.i.i
; Constant-operand variant of vmlsl_high_n_u16: umull of the high four i16
; lanes of %b with a constant splat of 29, subtracted from %a. The assertions
; expect "movi v2.8h, #29" followed by umlsl2.
416 define <4 x i32> @test_vmlsl_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
417 ; CHECK-LABEL: test_vmlsl_high_n_u16_imm:
418 ; CHECK: // %bb.0: // %entry
419 ; CHECK-NEXT: movi v2.8h, #29
420 ; CHECK-NEXT: umlsl2 v0.4s, v1.8h, v2.8h
423 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
424 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
425 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
426 ret <4 x i32> %sub.i.i
; vmlsl_high_n_u32: umull of the high two i32 lanes of %b (shuffle indices
; 2-3) with a splat of the scalar %c, subtracted from the accumulator %a. The
; assertions expect a dup of w0 into v2.4s followed by umlsl2.
429 define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
430 ; CHECK-LABEL: test_vmlsl_high_n_u32:
431 ; CHECK: // %bb.0: // %entry
432 ; CHECK-NEXT: dup v2.4s, w0
433 ; CHECK-NEXT: umlsl2 v0.2d, v1.4s, v2.4s
436 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
437 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
438 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
439 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
440 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
441 ret <2 x i64> %sub.i.i
; Constant-operand variant of vmlsl_high_n_u32: umull of the high two i32
; lanes of %b with a constant splat of 29, subtracted from %a. The assertions
; expect "movi v2.4s, #29" followed by umlsl2.
444 define <2 x i64> @test_vmlsl_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
445 ; CHECK-LABEL: test_vmlsl_high_n_u32_imm:
446 ; CHECK: // %bb.0: // %entry
447 ; CHECK-NEXT: movi v2.4s, #29
448 ; CHECK-NEXT: umlsl2 v0.2d, v1.4s, v2.4s
451 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
452 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
453 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
454 ret <2 x i64> %sub.i.i
; vqdmlsl_high_n_s16: sqdmull of the high four i16 lanes of %b (shuffle
; indices 4-7) with a splat of the scalar %c, whose result is combined with
; the accumulator %a through the sqsub intrinsic. The assertions expect the
; pair to fuse into sqdmlsl2 after a dup of w0 into v2.8h.
457 define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
458 ; CHECK-LABEL: test_vqdmlsl_high_n_s16:
459 ; CHECK: // %bb.0: // %entry
460 ; CHECK-NEXT: dup v2.8h, w0
461 ; CHECK-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.8h
464 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
465 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
466 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
467 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
468 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
469 %vqdmlsl15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
470 %vqdmlsl17.i.i = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
471 ret <4 x i32> %vqdmlsl17.i.i
; Constant-operand variant of vqdmlsl_high_n_s16: sqdmull of the high four
; i16 lanes of %b with a constant splat of 29, combined with %a via sqsub.
; The assertions expect "movi v2.8h, #29" followed by sqdmlsl2.
474 define <4 x i32> @test_vqdmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
475 ; CHECK-LABEL: test_vqdmlsl_high_n_s16_imm:
476 ; CHECK: // %bb.0: // %entry
477 ; CHECK-NEXT: movi v2.8h, #29
478 ; CHECK-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.8h
481 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
482 %vqdmlsl15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
483 %vqdmlsl17.i.i = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
484 ret <4 x i32> %vqdmlsl17.i.i
; vqdmlsl_high_n_s32: sqdmull of the high two i32 lanes of %b (shuffle
; indices 2-3) with a splat of the scalar %c, combined with %a via sqsub. The
; assertions expect a dup of w0 into v2.4s followed by sqdmlsl2.
487 define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
488 ; CHECK-LABEL: test_vqdmlsl_high_n_s32:
489 ; CHECK: // %bb.0: // %entry
490 ; CHECK-NEXT: dup v2.4s, w0
491 ; CHECK-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.4s
494 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
495 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
496 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
497 %vqdmlsl9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
498 %vqdmlsl11.i.i = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
499 ret <2 x i64> %vqdmlsl11.i.i
; Constant-operand variant of vqdmlsl_high_n_s32: sqdmull of the high two i32
; lanes of %b with a constant splat of 29, combined with %a via sqsub. The
; assertions expect "movi v2.4s, #29" followed by sqdmlsl2.
502 define <2 x i64> @test_vqdmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
503 ; CHECK-LABEL: test_vqdmlsl_high_n_s32_imm:
504 ; CHECK: // %bb.0: // %entry
505 ; CHECK-NEXT: movi v2.4s, #29
506 ; CHECK-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.4s
509 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
510 %vqdmlsl9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
511 %vqdmlsl11.i.i = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
512 ret <2 x i64> %vqdmlsl11.i.i
; vmul_n_f32: the scalar float %b is splatted into both lanes of a 2 x float
; vector and multiplied with %a. The assertions expect no separate dup: the
; scalar register is used directly through the by-element form
; "fmul ..., v1.s[0]".
515 define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
516 ; CHECK-LABEL: test_vmul_n_f32:
517 ; CHECK: // %bb.0: // %entry
518 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
519 ; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[0]
522 %vecinit.i = insertelement <2 x float> undef, float %b, i32 0
523 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1
524 %mul.i = fmul <2 x float> %vecinit1.i, %a
525 ret <2 x float> %mul.i
; vmulq_n_f32: the scalar float %b is splatted into all four lanes of a
; 4 x float vector and multiplied with %a. The assertions expect the
; by-element form "fmul v0.4s, v0.4s, v1.s[0]" with no separate dup.
528 define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
529 ; CHECK-LABEL: test_vmulq_n_f32:
530 ; CHECK: // %bb.0: // %entry
531 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
532 ; CHECK-NEXT: fmul v0.4s, v0.4s, v1.s[0]
535 %vecinit.i = insertelement <4 x float> undef, float %b, i32 0
536 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1
537 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2
538 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3
539 %mul.i = fmul <4 x float> %vecinit3.i, %a
540 ret <4 x float> %mul.i
; vmulq_n_f64: the scalar double %b is splatted into both lanes of a
; 2 x double vector and multiplied with %a. The assertions expect the
; by-element form "fmul v0.2d, v0.2d, v1.d[0]" with no separate dup.
543 define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
544 ; CHECK-LABEL: test_vmulq_n_f64:
545 ; CHECK: // %bb.0: // %entry
546 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
547 ; CHECK-NEXT: fmul v0.2d, v0.2d, v1.d[0]
550 %vecinit.i = insertelement <2 x double> undef, double %b, i32 0
551 %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1
552 %mul.i = fmul <2 x double> %vecinit1.i, %a
553 ret <2 x double> %mul.i
; vfma_n_f32: llvm.fma.v2f32 is called with %b, a two-lane splat of the
; scalar %n, and the addend %a. The assertions expect the by-element form
; "fmla v0.2s, v1.2s, v2.s[0]" with no separate dup.
556 define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
557 ; CHECK-LABEL: test_vfma_n_f32:
558 ; CHECK: // %bb.0: // %entry
559 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
560 ; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[0]
563 %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
564 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
565 %0 = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a)
; vfmaq_n_f32: llvm.fma.v4f32 is called with %b, a four-lane splat of the
; scalar %n, and the addend %a. The assertions expect the by-element form
; "fmla v0.4s, v1.4s, v2.s[0]" with no separate dup.
569 define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
570 ; CHECK-LABEL: test_vfmaq_n_f32:
571 ; CHECK: // %bb.0: // %entry
572 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
573 ; CHECK-NEXT: fmla v0.4s, v1.4s, v2.s[0]
576 %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
577 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
578 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
579 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
580 %0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a)
; vfms_n_f32: %b is negated (fsub from a -0.0 splat) and passed to
; llvm.fma.v2f32 together with a two-lane splat of the scalar %n and the
; addend %a. The assertions expect the negation to fold into the by-element
; form "fmls v0.2s, v1.2s, v2.s[0]".
584 define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
585 ; CHECK-LABEL: test_vfms_n_f32:
586 ; CHECK: // %bb.0: // %entry
587 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
588 ; CHECK-NEXT: fmls v0.2s, v1.2s, v2.s[0]
591 %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
592 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
593 %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
594 %1 = call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a)
; vfmsq_n_f32: %b is negated (fsub from a -0.0 splat) and passed to
; llvm.fma.v4f32 together with a four-lane splat of the scalar %n and the
; addend %a. The assertions expect the negation to fold into the by-element
; form "fmls v0.4s, v1.4s, v2.s[0]".
598 define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
599 ; CHECK-LABEL: test_vfmsq_n_f32:
600 ; CHECK: // %bb.0: // %entry
601 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
602 ; CHECK-NEXT: fmls v0.4s, v1.4s, v2.s[0]
605 %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
606 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
607 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
608 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
609 %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
610 %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a)
; Attribute group #0 (nounwind) is the one referenced by every test function
; in this file. The declarations that follow cover the intrinsics those
; functions call: the AArch64 NEON widening multiplies (smull, umull,
; sqdmull), the sqadd/sqsub intrinsics, and the generic llvm.fma.
614 attributes #0 = { nounwind }
616 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
617 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
618 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
619 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
620 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
621 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
622 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
623 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
624 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
625 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
626 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
627 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)