; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o -| FileCheck %s
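; This file tests selection of the NEON widening multiply instructions:
; smull/umull for a mul of sign/zero-extended vectors, their accumulating
; (smlal/umlal) and subtracting (smlsl/umlsl) forms, and the high-half
; smull2/umull2 variants.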
define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}
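; The "amull" tests multiply zero-extended operands and then mask the result
; to the low bits of each lane. The masked bits are identical whichever
; extension is used, so smull plus a bic/and mask is an acceptable selection.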
define <8 x i16> @amull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: amull_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  %and = and <8 x i16> %tmp5, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @amull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: amull_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  %and = and <4 x i32> %tmp5, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @amull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: amull_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  %and = and <2 x i64> %tmp5, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}
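; smlal/umlal: the add of the widening product into a full-width accumulator
; folds into the accumulating form. The "amlal" tests mask the result, as in
; the "amull" tests above, so smlal plus a mask remains a valid selection.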
define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @amlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: amlal_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  %and = and <8 x i16> %tmp7, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @amlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: amlal_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  %and = and <4 x i32> %tmp7, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @amlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: amlal_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  %and = and <2 x i64> %tmp7, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}
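; smlsl/umlsl: likewise, subtracting the widening product from the
; accumulator folds into the multiply-subtract form; "amlsl" adds the mask.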
define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @amlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: amlsl_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  %and = and <8 x i16> %tmp7, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @amlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: amlsl_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  %and = and <4 x i32> %tmp7, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @amlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: amlsl_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  %and = and <2 x i64> %tmp7, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}
; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8b, #244
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #64537
; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-NEXT:    dup v1.8h, w8
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v1.4h, #11
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}

define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #-1234
; CHECK-NEXT:    dup v1.2s, w8
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}
define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8b, #12
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #999
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    dup v1.8h, w8
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #1234
; CHECK-NEXT:    dup v1.4h, w8
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}

define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #1234
; CHECK-NEXT:    dup v1.2s, w8
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}
define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8b, #12
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  %and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #1234
; CHECK-NEXT:    dup v1.4h, w8
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  %and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #1234
; CHECK-NEXT:    dup v1.2s, w8
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  %and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}
define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) {
; If one operand has a zero-extend and the other a sign-extend, smull
; cannot be used.
; CHECK-LABEL: smullWithInconsistentExtensions:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    umov w0, v0.h[0]
; CHECK-NEXT:    ret
  %s = sext <8 x i8> %x to <8 x i16>
  %z = zext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %s, %z
  %r = extractelement <8 x i16> %m, i32 0
  ret i16 %r
}
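; The multiply below is distributed over the add, (A + B) * C becoming
; A*C + B*C, so the whole expression maps onto one umull plus one umlal
; rather than a full-width multiply.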
define void @distribute(<8 x i16>* %dst, <16 x i8>* %src, i32 %mul) nounwind {
; CHECK-LABEL: distribute:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    dup v1.8b, w2
; CHECK-NEXT:    mov d2, v0.d[1]
; CHECK-NEXT:    umull v2.8h, v2.8b, v1.8b
; CHECK-NEXT:    umlal v2.8h, v0.8b, v1.8b
; CHECK-NEXT:    str q2, [x0]
; CHECK-NEXT:    ret
entry:
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = load <16 x i8>, <16 x i8>* %src, align 1
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = zext <8 x i8> %6 to <8 x i16>
  %8 = zext <8 x i8> %2 to <8 x i16>
  %9 = extractelement <2 x double> %4, i32 0
  %10 = bitcast double %9 to <8 x i8>
  %11 = zext <8 x i8> %10 to <8 x i16>
  %12 = add <8 x i16> %7, %11
  %13 = mul <8 x i16> %12, %8
  store <8 x i16> %13, <8 x i16>* %dst, align 2
  ret void
}
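; For results wider than one Q register, the multiply is split: smull/umull
; handles the low halves of the inputs and smull2/umull2 the high halves.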
define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; CHECK-LABEL: umull2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
  %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
  %mul = mul <16 x i16> %arg1_ext, %arg2_ext
  ret <16 x i16> %mul
}

define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; CHECK-LABEL: smull2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = sext <16 x i8> %arg1 to <16 x i16>
  %arg2_ext = sext <16 x i8> %arg2 to <16 x i16>
  %mul = mul <16 x i16> %arg1_ext, %arg2_ext
  ret <16 x i16> %mul
}

define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; CHECK-LABEL: umull2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
  %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
  %mul = mul <8 x i32> %arg1_ext, %arg2_ext
  ret <8 x i32> %mul
}

define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; CHECK-LABEL: smull2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = sext <8 x i16> %arg1 to <8 x i32>
  %arg2_ext = sext <8 x i16> %arg2 to <8 x i32>
  %mul = mul <8 x i32> %arg1_ext, %arg2_ext
  ret <8 x i32> %mul
}

define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: umull2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
  %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
  %mul = mul <4 x i64> %arg1_ext, %arg2_ext
  ret <4 x i64> %mul
}

define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: smull2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = sext <4 x i32> %arg1 to <4 x i64>
  %arg2_ext = sext <4 x i32> %arg2 to <4 x i64>
  %mul = mul <4 x i64> %arg1_ext, %arg2_ext
  ret <4 x i64> %mul
}
define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; CHECK-LABEL: amull2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull v2.8h, v0.8b, v1.8b
; CHECK-NEXT:    smull2 v1.8h, v0.16b, v1.16b
; CHECK-NEXT:    bic v2.8h, #255, lsl #8
; CHECK-NEXT:    bic v1.8h, #255, lsl #8
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
  %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
  %mul = mul <16 x i16> %arg1_ext, %arg2_ext
  %and = and <16 x i16> %mul, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <16 x i16> %and
}

define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; CHECK-LABEL: amull2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull v2.4s, v0.4h, v1.4h
; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    movi v3.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v1.16b, v0.16b, v3.16b
; CHECK-NEXT:    and v0.16b, v2.16b, v3.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
  %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
  %mul = mul <8 x i32> %arg1_ext, %arg2_ext
  %and = and <8 x i32> %mul, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  ret <8 x i32> %and
}

define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: amull2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull v2.2d, v0.2s, v1.2s
; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    movi v3.2d, #0x000000ffffffff
; CHECK-NEXT:    and v1.16b, v0.16b, v3.16b
; CHECK-NEXT:    and v0.16b, v2.16b, v3.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
  %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
  %mul = mul <4 x i64> %arg1_ext, %arg2_ext
  %and = and <4 x i64> %mul, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  ret <4 x i64> %and
}