; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o -| FileCheck %s
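
; Check that multiplies (and multiply-accumulate/subtract chains) of
; sign/zero-extended vector operands are selected to the NEON widening
; multiply instructions smull/umull, smlal/umlal and smlsl/umlsl.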
define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}
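
; Unsigned variants: the same pattern with zext instead of sext should
; select umull.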

define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}
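
; Accumulating variants: feeding the widened product into an add should
; select the multiply-accumulate-long forms (smlal/umlal), with the
; accumulator already in the destination register.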

define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}
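
; Subtracting variants: feeding the widened product into a sub should
; select the multiply-subtract-long forms (smlsl/umlsl).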

define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}

define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}

define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}

define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}

define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) {
; If one operand has a zero-extend and the other a sign-extend, smull
; cannot be used.
; CHECK-LABEL: smullWithInconsistentExtensions:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %s = sext <8 x i8> %x to <8 x i16>
  %z = zext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %s, %z
  %r = extractelement <8 x i16> %m, i32 0
  ret i16 %r
}
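
; The add should be distributed over the two zext'ed operands so that
; (A + B) * C becomes A*C + B*C, which maps onto a umull followed by a
; umlal instead of a vector add and full-width mul.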

define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
entry:
; CHECK-LABEL: distribute:
; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = zext <8 x i8> %6 to <8 x i16>
  %8 = zext <8 x i8> %2 to <8 x i16>
  %9 = extractelement <2 x double> %4, i32 0
  %10 = bitcast double %9 to <8 x i8>
  %11 = zext <8 x i8> %10 to <8 x i16>
  %12 = add <8 x i16> %7, %11
  %13 = mul <8 x i16> %12, %8
  %14 = bitcast i16* %dst to i8*
  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
  ret void
}

declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly

declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind