; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
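
; Most tests below lower identically with +neon and +sve and use the shared
; CHECK prefix; smull_zext_v2i32_v2i64 is the one case where the two
; configurations diverge (CHECK-NEON vs CHECK-SVE).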
define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @smull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

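; smull can still be used when one operand is zero-extended from a strictly
; narrower type: the zero-extended value is non-negative in the intermediate
; type, so sign- and zero-extending it to the wide type agree. The
; top_bit_is_1 test below covers the case where that assumption fails.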
define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_zext_v8i8_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    smull2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    smull v0.4s, v0.4h, v2.4h
; CHECK-NEXT:    ret
  %load.A = load <8 x i8>, ptr %A
  %load.B = load <8 x i16>, ptr %B
  %zext.A = zext <8 x i8> %load.A to <8 x i32>
  %sext.B = sext <8 x i16> %load.B to <8 x i32>
  %res = mul <8 x i32> %zext.A, %sext.B
  ret <8 x i32> %res
}

define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x1]
; CHECK-NEXT:    ldr q2, [x0]
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    smull2 v1.4s, v2.8h, v0.8h
; CHECK-NEXT:    smull v0.4s, v2.4h, v0.4h
; CHECK-NEXT:    ret
  %load.A = load <8 x i16>, ptr %A
  %load.B = load <8 x i8>, ptr %B
  %sext.A = sext <8 x i16> %load.A to <8 x i32>
  %zext.B = zext <8 x i8> %load.B to <8 x i32>
  %res = mul <8 x i32> %sext.A, %zext.B
  ret <8 x i32> %res
}

define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    orr v0.8h, #128, lsl #8
; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT:    ushll v2.4s, v0.4h, #0
; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #0
; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-NEXT:    mul v0.4s, v2.4s, v3.4s
; CHECK-NEXT:    ret
  %load.A = load <8 x i16>, ptr %A
  %or.A = or <8 x i16> %load.A, <i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000>
  %load.B = load <8 x i16>, ptr %B
  %zext.A = zext <8 x i16> %or.A to <8 x i32>
  %sext.B = sext <8 x i16> %load.B to <8 x i32>
  %res = mul <8 x i32> %zext.A, %sext.B
  ret <8 x i32> %res
}

define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_zext_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %load.A = load <4 x i8>, ptr %A
  %load.B = load <4 x i16>, ptr %B
  %zext.A = zext <4 x i8> %load.A to <4 x i32>
  %sext.B = sext <4 x i16> %load.B to <4 x i32>
  %res = mul <4 x i32> %zext.A, %sext.B
  ret <4 x i32> %res
}

define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr d0, [x1]
; CHECK-NEON-NEXT:    ldrh w9, [x0]
; CHECK-NEON-NEXT:    ldrh w10, [x0, #2]
; CHECK-NEON-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-NEON-NEXT:    fmov x11, d0
; CHECK-NEON-NEXT:    mov x8, v0.d[1]
; CHECK-NEON-NEXT:    smull x9, w9, w11
; CHECK-NEON-NEXT:    smull x8, w10, w8
; CHECK-NEON-NEXT:    fmov d0, x9
; CHECK-NEON-NEXT:    mov v0.d[1], x8
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldrh w8, [x0]
; CHECK-SVE-NEXT:    ptrue p0.d, vl2
; CHECK-SVE-NEXT:    ldrh w9, [x0, #2]
; CHECK-SVE-NEXT:    ldr d0, [x1]
; CHECK-SVE-NEXT:    fmov d1, x8
; CHECK-SVE-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-SVE-NEXT:    mov v1.d[1], x9
; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-SVE-NEXT:    ret
  %load.A = load <2 x i16>, ptr %A
  %load.B = load <2 x i32>, ptr %B
  %zext.A = zext <2 x i16> %load.A to <2 x i64>
  %sext.B = sext <2 x i32> %load.B to <2 x i64>
  %res = mul <2 x i64> %zext.A, %sext.B
  ret <2 x i64> %res
}

define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_zext_and_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    bic v0.2s, #128, lsl #24
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %load.A = load <2 x i32>, ptr %A
  %and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
  %load.B = load <2 x i32>, ptr %B
  %zext.A = zext <2 x i32> %and.A to <2 x i64>
  %sext.B = sext <2 x i32> %load.B to <2 x i64>
  %res = mul <2 x i64> %zext.A, %sext.B
  ret <2 x i64> %res
}

define <8 x i16> @umull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @umull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

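; The "amull" tests multiply zero-extended operands and then mask the result
; to the low half of each lane. smull is acceptable here because the kept
; bits of the product depend only on the low bits of the operands, which
; sign- and zero-extension leave identical. The amlal/amlsl accumulate
; variants further down rely on the same reasoning.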
define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: amull_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  %and = and <8 x i16> %tmp5, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: amull_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    movi v0.2d, #0x00ffff0000ffff
; CHECK-NEXT:    smull v1.4s, v1.4h, v2.4h
; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  %and = and <4 x i32> %tmp5, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: amull_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    movi v0.2d, #0x000000ffffffff
; CHECK-NEXT:    smull v1.2d, v1.2s, v2.2s
; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  %and = and <2 x i64> %tmp5, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}

define <8 x i16> @smlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = load <8 x i8>, ptr %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i16>, ptr %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i32>, ptr %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = load <8 x i8>, ptr %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i16>, ptr %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i32>, ptr %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: amlal_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = load <8 x i8>, ptr %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  %and = and <8 x i16> %tmp7, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: amlal_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i16>, ptr %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  %and = and <4 x i32> %tmp7, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: amlal_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i32>, ptr %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  %and = and <2 x i64> %tmp7, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}

define <8 x i16> @smlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = load <8 x i8>, ptr %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i16>, ptr %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i32>, ptr %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = load <8 x i8>, ptr %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i16>, ptr %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i32>, ptr %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: amlsl_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = load <8 x i8>, ptr %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  %and = and <8 x i16> %tmp7, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: amlsl_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i16>, ptr %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  %and = and <4 x i32> %tmp7, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: amlsl_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i64>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i32>, ptr %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  %and = and <2 x i64> %tmp7, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}

; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8b, #244
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #64537 // =0xfc19
; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-NEXT:    dup v1.8h, w8
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v1.4h, #11
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}

define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #-1234 // =0xfffffb2e
; CHECK-NEXT:    dup v1.2s, w8
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}

define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8b, #12
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #999 // =0x3e7
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    dup v1.8h, w8
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #1234 // =0x4d2
; CHECK-NEXT:    dup v1.4h, w8
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}

define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #1234 // =0x4d2
; CHECK-NEXT:    dup v1.2s, w8
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}

define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8b, #12
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  %and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %and
}

define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #1234 // =0x4d2
; CHECK-NEXT:    dup v1.4h, w8
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  %and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}

define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #1234 // =0x4d2
; CHECK-NEXT:    dup v1.2s, w8
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  %and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}

define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) {
; If one operand has a zero-extend and the other a sign-extend, smull
; cannot be used.
; CHECK-LABEL: smullWithInconsistentExtensions:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    umov w0, v0.h[0]
; CHECK-NEXT:    ret
  %s = sext <8 x i8> %x to <8 x i16>
  %z = zext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %s, %z
  %r = extractelement <8 x i16> %m, i32 0
  ret i16 %r
}

define <8 x i16> @smull_extended_vector_operand(<8 x i16> %v) {
; CHECK-LABEL: smull_extended_vector_operand:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v1.4s, #139, lsl #8
; CHECK-NEXT:    sshll v2.4s, v0.4h, #0
; CHECK-NEXT:    sshll2 v0.4s, v0.8h, #0
; CHECK-NEXT:    mul v2.4s, v2.4s, v1.4s
; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
; CHECK-NEXT:    ret
entry:
  %0 = sext <8 x i16> %v to <8 x i32>
  %1 = mul <8 x i32> %0, <i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584>
  %2 = lshr <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %3 = trunc <8 x i32> %2 to <8 x i16>
  ret <8 x i16> %3
}

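; distribute checks that (x + y) * z is turned into x*z + y*z so the whole
; expression can lower to a widening umull + umlal pair.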
define void @distribute(ptr %dst, ptr %src, i32 %mul) nounwind {
; CHECK-LABEL: distribute:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    dup v1.8b, w2
; CHECK-NEXT:    mov d2, v0.d[1]
; CHECK-NEXT:    umull v2.8h, v2.8b, v1.8b
; CHECK-NEXT:    umlal v2.8h, v0.8b, v1.8b
; CHECK-NEXT:    str q2, [x0]
; CHECK-NEXT:    ret
entry:
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = load <16 x i8>, ptr %src, align 1
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = zext <8 x i8> %6 to <8 x i16>
  %8 = zext <8 x i8> %2 to <8 x i16>
  %9 = extractelement <2 x double> %4, i32 0
  %10 = bitcast double %9 to <8 x i8>
  %11 = zext <8 x i8> %10 to <8 x i16>
  %12 = add <8 x i16> %7, %11
  %13 = mul <8 x i16> %12, %8
  store <8 x i16> %13, ptr %dst, align 2
  ret void
}

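; The *2 tests widen both halves of 128-bit inputs: the low half should use
; [su]mull and the high half [su]mull2, with no explicit extends left over.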
define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; CHECK-LABEL: umull2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
  %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
  %mul = mul <16 x i16> %arg1_ext, %arg2_ext
  ret <16 x i16> %mul
}

define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; CHECK-LABEL: smull2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = sext <16 x i8> %arg1 to <16 x i16>
  %arg2_ext = sext <16 x i8> %arg2 to <16 x i16>
  %mul = mul <16 x i16> %arg1_ext, %arg2_ext
  ret <16 x i16> %mul
}

define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; CHECK-LABEL: umull2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
  %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
  %mul = mul <8 x i32> %arg1_ext, %arg2_ext
  ret <8 x i32> %mul
}

define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; CHECK-LABEL: smull2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = sext <8 x i16> %arg1 to <8 x i32>
  %arg2_ext = sext <8 x i16> %arg2 to <8 x i32>
  %mul = mul <8 x i32> %arg1_ext, %arg2_ext
  ret <8 x i32> %mul
}

define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: umull2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
  %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
  %mul = mul <4 x i64> %arg1_ext, %arg2_ext
  ret <4 x i64> %mul
}

define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: smull2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    mov v1.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = sext <4 x i32> %arg1 to <4 x i64>
  %arg2_ext = sext <4 x i32> %arg2 to <4 x i64>
  %mul = mul <4 x i64> %arg1_ext, %arg2_ext
  ret <4 x i64> %mul
}

define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; CHECK-LABEL: amull2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull v2.8h, v0.8b, v1.8b
; CHECK-NEXT:    smull2 v1.8h, v0.16b, v1.16b
; CHECK-NEXT:    bic v2.8h, #255, lsl #8
; CHECK-NEXT:    bic v1.8h, #255, lsl #8
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
  %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
  %mul = mul <16 x i16> %arg1_ext, %arg2_ext
  %and = and <16 x i16> %mul, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <16 x i16> %and
}

define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; CHECK-LABEL: amull2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v2.2d, #0x00ffff0000ffff
; CHECK-NEXT:    smull v3.4s, v0.4h, v1.4h
; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    and v1.16b, v0.16b, v2.16b
; CHECK-NEXT:    and v0.16b, v3.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
  %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
  %mul = mul <8 x i32> %arg1_ext, %arg2_ext
  %and = and <8 x i32> %mul, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  ret <8 x i32> %and
}

define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: amull2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v2.2d, #0x000000ffffffff
; CHECK-NEXT:    smull v3.2d, v0.2s, v1.2s
; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    and v1.16b, v0.16b, v2.16b
; CHECK-NEXT:    and v0.16b, v3.16b, v2.16b
; CHECK-NEXT:    ret
  %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
  %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
  %mul = mul <4 x i64> %arg1_ext, %arg2_ext
  %and = and <4 x i64> %mul, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  ret <4 x i64> %and
}

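; An and with 255 proves each lane fits the narrower type, so the masked
; operand can be truncated and fed to umull. umull_and256 keeps bit 8 and
; must therefore stay at full width.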
define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
; CHECK-LABEL: umull_and_v8i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bic v1.8h, #255, lsl #8
; CHECK-NEXT:    xtn v1.8b, v1.8h
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %in1 = zext <8 x i8> %src1 to <8 x i16>
  %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %out = mul nsw <8 x i16> %in1, %in2
  ret <8 x i16> %out
}

define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
; CHECK-LABEL: umull_and_v8i16_c:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bic v1.8h, #255, lsl #8
; CHECK-NEXT:    xtn v1.8b, v1.8h
; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
; CHECK-NEXT:    ret
entry:
  %in1 = zext <8 x i8> %src1 to <8 x i16>
  %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %out = mul nsw <8 x i16> %in2, %in1
  ret <8 x i16> %out
}

define <8 x i16> @umull_and256_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
; CHECK-LABEL: umull_and256_v8i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v2.8h, #1, lsl #8
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %in1 = zext <8 x i8> %src1 to <8 x i16>
  %in2 = and <8 x i16> %src2, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
  %out = mul nsw <8 x i16> %in1, %in2
  ret <8 x i16> %out
}

define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
; CHECK-LABEL: umull_andconst_v8i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %in1 = zext <8 x i8> %src1 to <8 x i16>
  %out = mul nsw <8 x i16> %in1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %out
}

define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) {
; CHECK-LABEL: umull_smaller_v8i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v2.8b, #15
; CHECK-NEXT:    bic v1.8h, #255, lsl #8
; CHECK-NEXT:    xtn v1.8b, v1.8h
; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %in1 = zext <8 x i4> %src1 to <8 x i16>
  %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %out = mul nsw <8 x i16> %in1, %in2
  ret <8 x i16> %out
}

define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) {
; CHECK-LABEL: umull_and_v4i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v2.2d, #0x0000ff000000ff
; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
; CHECK-NEXT:    xtn v1.4h, v1.4s
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %in1 = zext <4 x i16> %src1 to <4 x i32>
  %in2 = and <4 x i32> %src2, <i32 255, i32 255, i32 255, i32 255>
  %out = mul nsw <4 x i32> %in1, %in2
  ret <4 x i32> %out
}

define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) {
; CHECK-LABEL: umull_and_v8i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v3.2d, #0x0000ff000000ff
; CHECK-NEXT:    and v2.16b, v2.16b, v3.16b
; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
; CHECK-NEXT:    uzp1 v2.8h, v1.8h, v2.8h
; CHECK-NEXT:    umull2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    umull v0.4s, v0.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %in1 = zext <8 x i16> %src1 to <8 x i32>
  %in2 = and <8 x i32> %src2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %out = mul nsw <8 x i32> %in1, %in2
  ret <8 x i32> %out
}

define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) {
; CHECK-LABEL: umull_and_v8i32_dup:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and w8, w0, #0xff
; CHECK-NEXT:    dup v2.8h, w8
; CHECK-NEXT:    umull2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    umull v0.4s, v0.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %in1 = zext <8 x i16> %src1 to <8 x i32>
  %in2 = and i32 %src2, 255
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %in2, i64 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %out = mul nsw <8 x i32> %in1, %broadcast.splat
  ret <8 x i32> %out
}

define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
; CHECK-LABEL: umull_and_v2i64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v2.2d, #0x000000000000ff
; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
; CHECK-NEXT:    xtn v1.2s, v1.2d
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %in1 = zext <2 x i32> %src1 to <2 x i64>
  %in2 = and <2 x i64> %src2, <i64 255, i64 255>
  %out = mul nsw <2 x i64> %in1, %in2
  ret <2 x i64> %out
}

define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
; CHECK-LABEL: umull_and_v4i64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v3.2d, #0x000000000000ff
; CHECK-NEXT:    and v2.16b, v2.16b, v3.16b
; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
; CHECK-NEXT:    uzp1 v2.4s, v1.4s, v2.4s
; CHECK-NEXT:    umull2 v1.2d, v0.4s, v2.4s
; CHECK-NEXT:    umull v0.2d, v0.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %in1 = zext <4 x i32> %src1 to <4 x i64>
  %in2 = and <4 x i64> %src2, <i64 255, i64 255, i64 255, i64 255>
  %out = mul nsw <4 x i64> %in1, %in2
  ret <4 x i64> %out
}

define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
; CHECK-LABEL: umull_and_v4i64_dup:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and w8, w0, #0xff
; CHECK-NEXT:    dup v2.4s, w8
; CHECK-NEXT:    umull2 v1.2d, v0.4s, v2.4s
; CHECK-NEXT:    umull v0.2d, v0.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %in1 = zext <4 x i32> %src1 to <4 x i64>
  %in2 = and i64 %src2, 255
  %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %in2, i64 0
  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
  %out = mul nsw <4 x i64> %in1, %broadcast.splat
  ret <4 x i64> %out
}

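; The uzp1 tests check that truncates feeding the high-half multiply
; intrinsics are combined into a single uzp1 of the two source registers.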
define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
; CHECK-LABEL: pmlsl2_v8i16_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <8 x i16>, ptr %5, align 4
  %7 = trunc <8 x i16> %6 to <8 x i8>
  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %8, <8 x i8> %7)
  %10 = sub <8 x i16> %1, %9
  store <8 x i16> %10, ptr %2, align 16
  ret void
}

define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
; CHECK-LABEL: smlsl2_v8i16_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
; CHECK-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <8 x i16>, ptr %5, align 4
  %7 = trunc <8 x i16> %6 to <8 x i8>
  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %8, <8 x i8> %7)
  %10 = sub <8 x i16> %1, %9
  store <8 x i16> %10, ptr %2, align 16
  ret void
}

define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
; CHECK-LABEL: umlsl2_v8i16_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
; CHECK-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <8 x i16>, ptr %5, align 4
  %7 = trunc <8 x i16> %6 to <8 x i8>
  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %8, <8 x i8> %7)
  %10 = sub <8 x i16> %1, %9
  store <8 x i16> %10, ptr %2, align 16
  ret void
}

define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
; CHECK-LABEL: smlsl2_v4i32_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
; CHECK-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <4 x i32>, ptr %5, align 4
  %7 = trunc <4 x i32> %6 to <4 x i16>
  %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %9 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %8, <4 x i16> %7)
  %10 = sub <4 x i32> %1, %9
  store <4 x i32> %10, ptr %2, align 16
  ret void
}

define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
; CHECK-LABEL: umlsl2_v4i32_uzp1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q2, [x1, #16]
; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
; CHECK-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
  %5 = getelementptr inbounds i32, ptr %3, i64 4
  %6 = load <4 x i32>, ptr %5, align 4
  %7 = trunc <4 x i32> %6 to <4 x i16>
  %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %9 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %8, <4 x i16> %7)
  %10 = sub <4 x i32> %1, %9
  store <4 x i32> %10, ptr %2, align 16
  ret void
}

define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
; CHECK-NEXT:    pmull v3.8h, v0.8b, v2.8b
; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
; CHECK-NEXT:    add v0.8h, v3.8h, v0.8h
; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <8 x i16>, ptr %3, align 4
  %6 = trunc <8 x i16> %5 to <8 x i8>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <8 x i16>, ptr %7, align 4
  %9 = trunc <8 x i16> %8 to <8 x i8>
  %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %11 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %10, <8 x i8> %6)
  %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %13 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %12, <8 x i8> %9)
  %14 = add <8 x i16> %11, %13
  %15 = sub <8 x i16> %1, %14
  store <8 x i16> %15, ptr %2, align 16
  ret void
}

define void @smlsl_smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: smlsl_smlsl2_v8i16_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
; CHECK-NEXT:    smlsl v1.8h, v0.8b, v2.8b
; CHECK-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <8 x i16>, ptr %3, align 4
  %6 = trunc <8 x i16> %5 to <8 x i8>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <8 x i16>, ptr %7, align 4
  %9 = trunc <8 x i16> %8 to <8 x i8>
  %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %11 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %10, <8 x i8> %6)
  %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %13 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %12, <8 x i8> %9)
  %14 = add <8 x i16> %11, %13
  %15 = sub <8 x i16> %1, %14
  store <8 x i16> %15, ptr %2, align 16
  ret void
}

define void @umlsl_umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: umlsl_umlsl2_v8i16_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
; CHECK-NEXT:    umlsl v1.8h, v0.8b, v2.8b
; CHECK-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <8 x i16>, ptr %3, align 4
  %6 = trunc <8 x i16> %5 to <8 x i8>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <8 x i16>, ptr %7, align 4
  %9 = trunc <8 x i16> %8 to <8 x i8>
  %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %11 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %10, <8 x i8> %6)
  %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %13 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %12, <8 x i8> %9)
  %14 = add <8 x i16> %11, %13
  %15 = sub <8 x i16> %1, %14
  store <8 x i16> %15, ptr %2, align 16
  ret void
}

define void @smlsl_smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: smlsl_smlsl2_v4i32_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
; CHECK-NEXT:    smlsl v1.4s, v0.4h, v2.4h
; CHECK-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <4 x i32>, ptr %3, align 4
  %6 = trunc <4 x i32> %5 to <4 x i16>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <4 x i32>, ptr %7, align 4
  %9 = trunc <4 x i32> %8 to <4 x i16>
  %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %11 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %10, <4 x i16> %6)
  %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %13 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %12, <4 x i16> %9)
  %14 = add <4 x i32> %11, %13
  %15 = sub <4 x i32> %1, %14
  store <4 x i32> %15, ptr %2, align 16
  ret void
}

define void @umlsl_umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
; CHECK-LABEL: umlsl_umlsl2_v4i32_uzp1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldp q2, q3, [x1]
; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
; CHECK-NEXT:    umlsl v1.4s, v0.4h, v2.4h
; CHECK-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %5 = load <4 x i32>, ptr %3, align 4
  %6 = trunc <4 x i32> %5 to <4 x i16>
  %7 = getelementptr inbounds i32, ptr %3, i64 4
  %8 = load <4 x i32>, ptr %7, align 4
  %9 = trunc <4 x i32> %8 to <4 x i16>
  %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %11 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %10, <4 x i16> %6)
  %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %13 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %12, <4 x i16> %9)
  %14 = add <4 x i32> %11, %13
  %15 = sub <4 x i32> %1, %14
  store <4 x i32> %15, ptr %2, align 16
  ret void
}

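; do_stuff checks that a high-half shuffle feeding the smull intrinsic is
; folded into smull2.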
define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) {
; CHECK-LABEL: do_stuff:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT:    smull2 v0.2d, v1.4s, v0.4s
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %bc.1 = bitcast <2 x i64> %1 to <4 x i32>
  %trunc.0 = trunc <2 x i64> %0 to <2 x i32>
  %shuff.hi = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 2, i32 3>
  %shuff.lo = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
  %smull = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuff.hi, <2 x i32> %trunc.0)
  %trunc.smull = trunc <2 x i64> %smull to <2 x i32>
  %final = add <2 x i32> %trunc.smull, %shuff.lo
  ret <2 x i32> %final
}

declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)