1 ; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast \
2 ; RUN: < %s -verify-machineinstrs -asm-verbose=false | FileCheck %s
; Signed widening multiply of the high half of %a by a broadcast i16 scalar.
; The shuffle picks lanes 4-7 of %a, the insertelement chain writes %b into
; all four lanes of a <4 x i16>, and both feed llvm.aarch64.neon.smull.v4i32.
; Expected codegen, immediately after the label: a `dup` of w0 into an .8h
; register, then one `smull2` that takes that same register as its last operand.
4 define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
5 ; CHECK-LABEL: test_vmull_high_n_s16:
6 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
7 ; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
10 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
11 %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
12 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
13 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
14 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
15 %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
16 ret <4 x i32> %vmull15.i.i
; Signed widening multiply of the high half of %a (lanes 4-7) by the constant
; splat <29, 29, 29, 29>, via llvm.aarch64.neon.smull.v4i32.
; Expected codegen, immediately after the label: the constant is materialized
; with `movi ... .8h, #29` and that register is the last operand of `smull2`.
19 define <4 x i32> @test_vmull_high_n_s16_imm(<8 x i16> %a) #0 {
20 ; CHECK-LABEL: test_vmull_high_n_s16_imm:
21 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
22 ; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
25 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
26 %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
27 ret <4 x i32> %vmull15.i.i
; Signed widening multiply of the high half of %a by a broadcast i32 scalar.
; The shuffle picks lanes 2-3 of %a, %b is inserted into both lanes of a
; <2 x i32>, and both feed llvm.aarch64.neon.smull.v2i64.
; Expected codegen, immediately after the label: a `dup` of w0 into a .4s
; register, then one `smull2` that takes that same register as its last operand.
30 define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
31 ; CHECK-LABEL: test_vmull_high_n_s32:
32 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
33 ; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
36 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
37 %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
38 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
39 %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
40 ret <2 x i64> %vmull9.i.i
; Signed widening multiply of the high half of %a (lanes 2-3) by the constant
; splat <511, 511>, via llvm.aarch64.neon.smull.v2i64. 511 is 0x1FF.
; Expected codegen, immediately after the label: the constant is materialized
; with `movi ... .4s, #1, msl #8` and that register is the last operand of
; `smull2`.
43 define <2 x i64> @test_vmull_high_n_s32_imm(<4 x i32> %a) #0 {
44 ; CHECK-LABEL: test_vmull_high_n_s32_imm:
45 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #1, msl #8
46 ; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
49 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
50 %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 511, i32 511>)
51 ret <2 x i64> %vmull9.i.i
; Unsigned widening multiply of the high half of %a by a broadcast i16 scalar.
; The shuffle picks lanes 4-7 of %a, the insertelement chain writes %b into
; all four lanes of a <4 x i16>, and both feed llvm.aarch64.neon.umull.v4i32.
; Expected codegen, immediately after the label: a `dup` of w0 into an .8h
; register, then one `umull2` that takes that same register as its last operand.
54 define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
55 ; CHECK-LABEL: test_vmull_high_n_u16:
56 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
57 ; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
60 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
61 %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
62 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
63 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
64 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
65 %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
66 ret <4 x i32> %vmull15.i.i
; Unsigned widening multiply of the high half of %a (lanes 4-7) by the constant
; splat of 4352, via llvm.aarch64.neon.umull.v4i32. 4352 is 0x1100 (17 << 8).
; Expected codegen, immediately after the label: the constant is materialized
; with `movi ... .8h, #17, lsl #8` and that register is the last operand of
; `umull2`.
69 define <4 x i32> @test_vmull_high_n_u16_imm(<8 x i16> %a) #0 {
70 ; CHECK-LABEL: test_vmull_high_n_u16_imm:
71 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
72 ; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
75 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
76 %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 4352, i16 4352, i16 4352, i16 4352>)
77 ret <4 x i32> %vmull15.i.i
; Unsigned widening multiply of the high half of %a by a broadcast i32 scalar.
; The shuffle picks lanes 2-3 of %a, %b is inserted into both lanes of a
; <2 x i32>, and both feed llvm.aarch64.neon.umull.v2i64.
; Expected codegen, immediately after the label: a `dup` of w0 into a .4s
; register, then one `umull2` that takes that same register as its last operand.
80 define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
81 ; CHECK-LABEL: test_vmull_high_n_u32:
82 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
83 ; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
86 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
87 %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
88 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
89 %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
90 ret <2 x i64> %vmull9.i.i
; Unsigned widening multiply of the high half of %a (lanes 2-3) by the constant
; splat of 4294966784, via llvm.aarch64.neon.umull.v2i64.
; 4294966784 is 0xFFFFFE00, the 32-bit complement of 0x1FF.
; Expected codegen, immediately after the label: the constant is materialized
; with `mvni ... .4s, #1, msl #8` and that register is the last operand of
; `umull2`.
93 define <2 x i64> @test_vmull_high_n_u32_imm(<4 x i32> %a) #0 {
94 ; CHECK-LABEL: test_vmull_high_n_u32_imm:
95 ; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].4s, #1, msl #8
96 ; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
99 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
100 %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 4294966784, i32 4294966784>)
101 ret <2 x i64> %vmull9.i.i
; llvm.aarch64.neon.sqdmull.v4i32 applied to the high half of %a and a
; broadcast i16 scalar. The shuffle picks lanes 4-7 of %a and the insertelement
; chain writes %b into all four lanes of a <4 x i16>.
; Expected codegen, immediately after the label: a `dup` of w0 into an .8h
; register, then one `sqdmull2` that takes that same register as its last
; operand.
104 define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
105 ; CHECK-LABEL: test_vqdmull_high_n_s16:
106 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
107 ; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
110 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
111 %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
112 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
113 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
114 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
115 %vqdmull15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
116 ret <4 x i32> %vqdmull15.i.i
; llvm.aarch64.neon.sqdmull.v4i32 applied to the high half of %a (lanes 4-7)
; and the constant splat of 61183.
; 61183 is 0xEEFF, the 16-bit complement of 0x1100 (17 << 8).
; Expected codegen, immediately after the label: the constant is materialized
; with `mvni ... .8h, #17, lsl #8` and that register is the last operand of
; `sqdmull2`.
119 define <4 x i32> @test_vqdmull_high_n_s16_imm(<8 x i16> %a) #0 {
120 ; CHECK-LABEL: test_vqdmull_high_n_s16_imm:
121 ; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
122 ; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
125 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
126 %vqdmull15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 61183, i16 61183, i16 61183, i16 61183>)
127 ret <4 x i32> %vqdmull15.i.i
; llvm.aarch64.neon.sqdmull.v2i64 applied to the high half of %a and a
; broadcast i32 scalar. The shuffle picks lanes 2-3 of %a and %b is inserted
; into both lanes of a <2 x i32>.
; Expected codegen, immediately after the label: a `dup` of w0 into a .4s
; register, then one `sqdmull2` that takes that same register as its last
; operand.
130 define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
131 ; CHECK-LABEL: test_vqdmull_high_n_s32:
132 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
133 ; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
136 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
137 %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
138 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
139 %vqdmull9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
140 ret <2 x i64> %vqdmull9.i.i
; llvm.aarch64.neon.sqdmull.v2i64 applied to the high half of %a (lanes 2-3)
; and the constant splat <29, 29>.
; Expected codegen, immediately after the label: the constant is materialized
; with `movi ... .4s, #29` and that register is the last operand of `sqdmull2`.
143 define <2 x i64> @test_vqdmull_high_n_s32_imm(<4 x i32> %a) #0 {
144 ; CHECK-LABEL: test_vqdmull_high_n_s32_imm:
145 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
146 ; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
149 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
150 %vqdmull9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
151 ret <2 x i64> %vqdmull9.i.i
; Signed widening multiply-accumulate: llvm.aarch64.neon.smull.v4i32 of the
; high half of %b (lanes 4-7) and a four-lane broadcast of %c, with the
; product then added to %a by a plain vector `add`.
; Expected codegen, immediately after the label: a `dup` of w0 into an .8h
; register, then one `smlal2` that takes that same register as its last
; operand.
154 define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
155 ; CHECK-LABEL: test_vmlal_high_n_s16:
156 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
157 ; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
160 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
161 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
162 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
163 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
164 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
165 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
166 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
167 ret <4 x i32> %add.i.i
; Signed widening multiply-accumulate with a constant multiplier:
; llvm.aarch64.neon.smull.v4i32 of the high half of %b (lanes 4-7) and the
; splat <29, 29, 29, 29>, with the product then added to %a.
; Expected codegen, immediately after the label: `movi ... .8h, #29`, then one
; `smlal2` that takes that register as its last operand.
170 define <4 x i32> @test_vmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
171 ; CHECK-LABEL: test_vmlal_high_n_s16_imm:
172 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
173 ; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
176 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
177 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
178 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
179 ret <4 x i32> %add.i.i
; Signed widening multiply-accumulate: llvm.aarch64.neon.smull.v2i64 of the
; high half of %b (lanes 2-3) and a two-lane broadcast of %c, with the product
; then added to %a by a plain vector `add`.
; Expected codegen, immediately after the label: a `dup` of w0 into a .4s
; register, then one `smlal2` that takes that same register as its last
; operand.
182 define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
183 ; CHECK-LABEL: test_vmlal_high_n_s32:
184 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
185 ; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
188 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
189 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
190 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
191 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
192 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
193 ret <2 x i64> %add.i.i
; Signed widening multiply-accumulate with a constant multiplier:
; llvm.aarch64.neon.smull.v2i64 of the high half of %b (lanes 2-3) and the
; splat <29, 29>, with the product then added to %a.
; Expected codegen, immediately after the label: `movi ... .4s, #29`, then one
; `smlal2` that takes that register as its last operand.
196 define <2 x i64> @test_vmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
197 ; CHECK-LABEL: test_vmlal_high_n_s32_imm:
198 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
199 ; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
202 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
203 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
204 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
205 ret <2 x i64> %add.i.i
; Unsigned widening multiply-accumulate: llvm.aarch64.neon.umull.v4i32 of the
; high half of %b (lanes 4-7) and a four-lane broadcast of %c, with the
; product then added to %a by a plain vector `add`.
; Expected codegen, immediately after the label: a `dup` of w0 into an .8h
; register, then one `umlal2` that takes that same register as its last
; operand.
208 define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
209 ; CHECK-LABEL: test_vmlal_high_n_u16:
210 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
211 ; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
214 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
215 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
216 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
217 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
218 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
219 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
220 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
221 ret <4 x i32> %add.i.i
; Unsigned widening multiply-accumulate with a constant multiplier:
; llvm.aarch64.neon.umull.v4i32 of the high half of %b (lanes 4-7) and the
; splat <29, 29, 29, 29>, with the product then added to %a.
; Expected codegen, immediately after the label: `movi ... .8h, #29`, then one
; `umlal2` that takes that register as its last operand.
224 define <4 x i32> @test_vmlal_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
225 ; CHECK-LABEL: test_vmlal_high_n_u16_imm:
226 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
227 ; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
230 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
231 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
232 %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
233 ret <4 x i32> %add.i.i
; Unsigned widening multiply-accumulate: llvm.aarch64.neon.umull.v2i64 of the
; high half of %b (lanes 2-3) and a two-lane broadcast of %c, with the product
; then added to %a by a plain vector `add`.
; Expected codegen, immediately after the label: a `dup` of w0 into a .4s
; register, then one `umlal2` that takes that same register as its last
; operand.
236 define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
237 ; CHECK-LABEL: test_vmlal_high_n_u32:
238 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
239 ; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
242 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
243 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
244 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
245 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
246 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
247 ret <2 x i64> %add.i.i
; Unsigned widening multiply-accumulate with a constant multiplier:
; llvm.aarch64.neon.umull.v2i64 of the high half of %b (lanes 2-3) and the
; splat <29, 29>, with the product then added to %a.
; Expected codegen, immediately after the label: `movi ... .4s, #29`, then one
; `umlal2` that takes that register as its last operand.
250 define <2 x i64> @test_vmlal_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
251 ; CHECK-LABEL: test_vmlal_high_n_u32_imm:
252 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
253 ; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
256 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
257 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
258 %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
259 ret <2 x i64> %add.i.i
; llvm.aarch64.neon.sqdmull.v4i32 of the high half of %b (lanes 4-7) and a
; four-lane broadcast of %c, whose result is combined with %a through
; llvm.aarch64.neon.sqadd.v4i32.
; Expected codegen, immediately after the label: a `dup` of w0 into an .8h
; register, then one `sqdmlal2` that takes that same register as its last
; operand (the sqdmull/sqadd pair is expected to fuse).
262 define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
263 ; CHECK-LABEL: test_vqdmlal_high_n_s16:
264 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
265 ; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
268 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
269 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
270 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
271 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
272 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
273 %vqdmlal15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
274 %vqdmlal17.i.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
275 ret <4 x i32> %vqdmlal17.i.i
; llvm.aarch64.neon.sqdmull.v4i32 of the high half of %b (lanes 4-7) and the
; constant splat <29, 29, 29, 29>, whose result is combined with %a through
; llvm.aarch64.neon.sqadd.v4i32.
; Expected codegen, immediately after the label: `movi ... .8h, #29`, then one
; `sqdmlal2` that takes that register as its last operand.
278 define <4 x i32> @test_vqdmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
279 ; CHECK-LABEL: test_vqdmlal_high_n_s16_imm:
280 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
281 ; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
284 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
285 %vqdmlal15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
286 %vqdmlal17.i.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
287 ret <4 x i32> %vqdmlal17.i.i
; llvm.aarch64.neon.sqdmull.v2i64 of the high half of %b (lanes 2-3) and a
; two-lane broadcast of %c, whose result is combined with %a through
; llvm.aarch64.neon.sqadd.v2i64.
; Expected codegen, immediately after the label: a `dup` of w0 into a .4s
; register, then one `sqdmlal2` that takes that same register as its last
; operand.
290 define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
291 ; CHECK-LABEL: test_vqdmlal_high_n_s32:
292 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
293 ; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
296 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
297 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
298 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
299 %vqdmlal9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
300 %vqdmlal11.i.i = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
301 ret <2 x i64> %vqdmlal11.i.i
; llvm.aarch64.neon.sqdmull.v2i64 of the high half of %b (lanes 2-3) and the
; constant splat <29, 29>, whose result is combined with %a through
; llvm.aarch64.neon.sqadd.v2i64.
; Expected codegen, immediately after the label: `movi ... .4s, #29`, then one
; `sqdmlal2` that takes that register as its last operand.
304 define <2 x i64> @test_vqdmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
305 ; CHECK-LABEL: test_vqdmlal_high_n_s32_imm:
306 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
307 ; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
310 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
311 %vqdmlal9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
312 %vqdmlal11.i.i = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
313 ret <2 x i64> %vqdmlal11.i.i
; Signed widening multiply-subtract: llvm.aarch64.neon.smull.v4i32 of the high
; half of %b (lanes 4-7) and a four-lane broadcast of %c, with the product
; then subtracted from %a by a plain vector `sub`.
; Expected codegen, immediately after the label: a `dup` of w0 into an .8h
; register, then one `smlsl2` that takes that same register as its last
; operand.
316 define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
317 ; CHECK-LABEL: test_vmlsl_high_n_s16:
318 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
319 ; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
322 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
323 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
324 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
325 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
326 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
327 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
328 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
329 ret <4 x i32> %sub.i.i
; Signed widening multiply-subtract with a constant multiplier:
; llvm.aarch64.neon.smull.v4i32 of the high half of %b (lanes 4-7) and the
; splat <29, 29, 29, 29>, with the product then subtracted from %a.
; Expected codegen, immediately after the label: `movi ... .8h, #29`, then one
; `smlsl2` that takes that register as its last operand.
332 define <4 x i32> @test_vmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
333 ; CHECK-LABEL: test_vmlsl_high_n_s16_imm:
334 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
335 ; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
338 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
339 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
340 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
341 ret <4 x i32> %sub.i.i
; Signed widening multiply-subtract: llvm.aarch64.neon.smull.v2i64 of the high
; half of %b (lanes 2-3) and a two-lane broadcast of %c, with the product then
; subtracted from %a by a plain vector `sub`.
; Expected codegen, immediately after the label: a `dup` of w0 into a .4s
; register, then one `smlsl2` that takes that same register as its last
; operand.
344 define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
345 ; CHECK-LABEL: test_vmlsl_high_n_s32:
346 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
347 ; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
350 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
351 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
352 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
353 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
354 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
355 ret <2 x i64> %sub.i.i
; Signed widening multiply-subtract with a constant multiplier:
; llvm.aarch64.neon.smull.v2i64 of the high half of %b (lanes 2-3) and the
; splat <29, 29>, with the product then subtracted from %a.
; Expected codegen, immediately after the label: `movi ... .4s, #29`, then one
; `smlsl2` that takes that register as its last operand.
358 define <2 x i64> @test_vmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
359 ; CHECK-LABEL: test_vmlsl_high_n_s32_imm:
360 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
361 ; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
364 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
365 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
366 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
367 ret <2 x i64> %sub.i.i
; Unsigned widening multiply-subtract: llvm.aarch64.neon.umull.v4i32 of the
; high half of %b (lanes 4-7) and a four-lane broadcast of %c, with the
; product then subtracted from %a by a plain vector `sub`.
; Expected codegen, immediately after the label: a `dup` of w0 into an .8h
; register, then one `umlsl2` that takes that same register as its last
; operand.
370 define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
371 ; CHECK-LABEL: test_vmlsl_high_n_u16:
372 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
373 ; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
376 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
377 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
378 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
379 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
380 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
381 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
382 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
383 ret <4 x i32> %sub.i.i
; Unsigned widening multiply-subtract with a constant multiplier:
; llvm.aarch64.neon.umull.v4i32 of the high half of %b (lanes 4-7) and the
; splat <29, 29, 29, 29>, with the product then subtracted from %a.
; Expected codegen, immediately after the label: `movi ... .8h, #29`, then one
; `umlsl2` that takes that register as its last operand.
386 define <4 x i32> @test_vmlsl_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
387 ; CHECK-LABEL: test_vmlsl_high_n_u16_imm:
388 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
389 ; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
392 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
393 %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
394 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
395 ret <4 x i32> %sub.i.i
; Unsigned widening multiply-subtract: llvm.aarch64.neon.umull.v2i64 of the
; high half of %b (lanes 2-3) and a two-lane broadcast of %c, with the product
; then subtracted from %a by a plain vector `sub`.
; Expected codegen, immediately after the label: a `dup` of w0 into a .4s
; register, then one `umlsl2` that takes that same register as its last
; operand.
398 define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
399 ; CHECK-LABEL: test_vmlsl_high_n_u32:
400 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
401 ; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
404 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
405 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
406 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
407 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
408 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
409 ret <2 x i64> %sub.i.i
; Unsigned widening multiply-subtract with a constant multiplier:
; llvm.aarch64.neon.umull.v2i64 of the high half of %b (lanes 2-3) and the
; splat <29, 29>, with the product then subtracted from %a.
; Expected codegen, immediately after the label: `movi ... .4s, #29`, then one
; `umlsl2` that takes that register as its last operand.
412 define <2 x i64> @test_vmlsl_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
413 ; CHECK-LABEL: test_vmlsl_high_n_u32_imm:
414 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
415 ; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
418 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
419 %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
420 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
421 ret <2 x i64> %sub.i.i
; llvm.aarch64.neon.sqdmull.v4i32 of the high half of %b (lanes 4-7) and a
; four-lane broadcast of %c, whose result is combined with %a through
; llvm.aarch64.neon.sqsub.v4i32 (%a is the first operand).
; Expected codegen, immediately after the label: a `dup` of w0 into an .8h
; register, then one `sqdmlsl2` that takes that same register as its last
; operand.
424 define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
425 ; CHECK-LABEL: test_vqdmlsl_high_n_s16:
426 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
427 ; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
430 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
431 %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
432 %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
433 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
434 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
435 %vqdmlsl15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
436 %vqdmlsl17.i.i = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
437 ret <4 x i32> %vqdmlsl17.i.i
; llvm.aarch64.neon.sqdmull.v4i32 of the high half of %b (lanes 4-7) and the
; constant splat <29, 29, 29, 29>, whose result is combined with %a through
; llvm.aarch64.neon.sqsub.v4i32 (%a is the first operand).
; Expected codegen, immediately after the label: `movi ... .8h, #29`, then one
; `sqdmlsl2` that takes that register as its last operand.
440 define <4 x i32> @test_vqdmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
441 ; CHECK-LABEL: test_vqdmlsl_high_n_s16_imm:
442 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
443 ; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
446 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
447 %vqdmlsl15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
448 %vqdmlsl17.i.i = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
449 ret <4 x i32> %vqdmlsl17.i.i
; llvm.aarch64.neon.sqdmull.v2i64 of the high half of %b (lanes 2-3) and a
; two-lane broadcast of %c, whose result is combined with %a through
; llvm.aarch64.neon.sqsub.v2i64 (%a is the first operand).
; Expected codegen, immediately after the label: a `dup` of w0 into a .4s
; register, then one `sqdmlsl2` that takes that same register as its last
; operand.
452 define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
453 ; CHECK-LABEL: test_vqdmlsl_high_n_s32:
454 ; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
455 ; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
458 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
459 %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
460 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
461 %vqdmlsl9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
462 %vqdmlsl11.i.i = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
463 ret <2 x i64> %vqdmlsl11.i.i
; llvm.aarch64.neon.sqdmull.v2i64 of the high half of %b (lanes 2-3) and the
; constant splat <29, 29>, whose result is combined with %a through
; llvm.aarch64.neon.sqsub.v2i64 (%a is the first operand).
; Expected codegen, immediately after the label: `movi ... .4s, #29`, then one
; `sqdmlsl2` that takes that register as its last operand.
466 define <2 x i64> @test_vqdmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
467 ; CHECK-LABEL: test_vqdmlsl_high_n_s32_imm:
468 ; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
469 ; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
472 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
473 %vqdmlsl9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
474 %vqdmlsl11.i.i = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
475 ret <2 x i64> %vqdmlsl11.i.i
; Multiplies the <2 x float> %a by a vector holding the scalar %b in both
; lanes (built with two insertelement instructions).
; Expected codegen, immediately after the label: a by-element `fmul` on .2s
; vectors whose last operand is lane 0 (`.s[0]`) of a vector register.
478 define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
479 ; CHECK-LABEL: test_vmul_n_f32:
480 ; CHECK-NEXT: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
483 %vecinit.i = insertelement <2 x float> undef, float %b, i32 0
484 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1
485 %mul.i = fmul <2 x float> %vecinit1.i, %a
486 ret <2 x float> %mul.i
; Multiplies the <4 x float> %a by a vector holding the scalar %b in all four
; lanes (built with four insertelement instructions).
; Expected codegen, immediately after the label: a by-element `fmul` on .4s
; vectors whose last operand is lane 0 (`.s[0]`) of a vector register.
489 define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
490 ; CHECK-LABEL: test_vmulq_n_f32:
491 ; CHECK-NEXT: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
494 %vecinit.i = insertelement <4 x float> undef, float %b, i32 0
495 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1
496 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2
497 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3
498 %mul.i = fmul <4 x float> %vecinit3.i, %a
499 ret <4 x float> %mul.i
; Multiplies the <2 x double> %a by a vector holding the scalar %b in both
; lanes (built with two insertelement instructions).
; Expected codegen, immediately after the label: a by-element `fmul` on .2d
; vectors whose last operand is lane 0 (`.d[0]`) of a vector register.
502 define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
503 ; CHECK-LABEL: test_vmulq_n_f64:
504 ; CHECK-NEXT: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
507 %vecinit.i = insertelement <2 x double> undef, double %b, i32 0
508 %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1
509 %mul.i = fmul <2 x double> %vecinit1.i, %a
510 ret <2 x double> %mul.i
; Calls llvm.fma.v2f32 with %b as the first factor, a two-lane broadcast of
; the scalar %n as the second factor, and %a as the addend.
; Expected codegen, immediately after the label: a by-element `fmla` on .2s
; vectors; the pattern accepts any numeric lane index on the `.s[...]` operand.
513 define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
514 ; CHECK-LABEL: test_vfma_n_f32:
515 ; CHECK-NEXT: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
518 %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
519 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
520 %0 = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a)
; Calls llvm.fma.v4f32 with %b as the first factor, a four-lane broadcast of
; the scalar %n as the second factor, and %a as the addend.
; Expected codegen, immediately after the label: a by-element `fmla` on .4s
; vectors; the pattern accepts any numeric lane index on the `.s[...]` operand.
524 define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
525 ; CHECK-LABEL: test_vfmaq_n_f32:
526 ; CHECK-NEXT: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
529 %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
530 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
531 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
532 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
533 %0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a)
; Negates %b by subtracting it from a vector of -0.0, then calls
; llvm.fma.v2f32 with the negated %b as the first factor, a two-lane broadcast
; of the scalar %n as the second factor, and %a as the addend.
; Expected codegen, immediately after the label: a by-element `fmls` on .2s
; vectors; the pattern accepts any numeric lane index on the `.s[...]` operand.
537 define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
538 ; CHECK-LABEL: test_vfms_n_f32:
539 ; CHECK-NEXT: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
542 %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
543 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
544 %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
545 %1 = call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a)
; Negates %b by subtracting it from a vector of -0.0, then calls
; llvm.fma.v4f32 with the negated %b as the first factor, a four-lane
; broadcast of the scalar %n as the second factor, and %a as the addend.
; Expected codegen, immediately after the label: a by-element `fmls` on .4s
; vectors; the pattern accepts any numeric lane index on the `.s[...]` operand.
549 define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
550 ; CHECK-LABEL: test_vfmsq_n_f32:
551 ; CHECK-NEXT: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
554 %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
555 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
556 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
557 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
558 %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
559 %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a)
; Attribute group referenced as #0 by the test functions in this file.
563 attributes #0 = { nounwind }
; Widening multiply intrinsics (signed, unsigned, and sqdmull), each in a
; <4 x i16> -> <4 x i32> and a <2 x i32> -> <2 x i64> form.
565 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
566 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
567 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
568 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
569 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
570 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
; sqsub / sqadd intrinsics that the vqdmlsl / vqdmlal tests apply to the
; sqdmull result and the accumulator.
571 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
572 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
573 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
574 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
; Generic fused multiply-add intrinsics called by the vfma / vfms tests.
575 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
576 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)